author | Thomas Koenig <tkoenig@gcc.gnu.org> | 2020-10-28 18:41:24 +0100
---|---|---
committer | Thomas Koenig <tkoenig@gcc.gnu.org> | 2020-10-28 18:41:24 +0100
commit | bf6dad60c338a42a7fb85f7b2a5870c0fb2e20f8 (patch) |
tree | e513781ef717465e7db0358e987a5a6cbef5665c | /gcc/config
parent | 0c261d5b5c931d9e9214d06531bdc7e9e16aeaab (diff) |
parent | 47d13acbda9a5d8eb57ff169ba74857cd54108e4 (diff) |
Merge branch 'master' into devel/coarray_native.
Merge into devel/coarray_native to prepare for later merging of
coarray_native with master.
Diffstat (limited to 'gcc/config')
147 files changed, 6989 insertions, 3292 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 4f33dd9..732a4dc 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -2024,7 +2024,7 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target) return target; } -/* Expand an expression EXP as fpsr or cpsr setter (depending on +/* Expand an expression EXP as fpsr or fpcr setter (depending on UNSPEC) using MODE. */ static void aarch64_expand_fpsr_fpcr_setter (int unspec, machine_mode mode, tree exp) @@ -2034,6 +2034,18 @@ aarch64_expand_fpsr_fpcr_setter (int unspec, machine_mode mode, tree exp) emit_insn (gen_aarch64_set (unspec, mode, op)); } +/* Expand a fpsr or fpcr getter (depending on UNSPEC) using MODE. + Return the target. */ +static rtx +aarch64_expand_fpsr_fpcr_getter (enum insn_code icode, machine_mode mode, + rtx target) +{ + expand_operand op; + create_output_operand (&op, target, mode); + expand_insn (icode, 1, &op); + return op.value; +} + /* Expand an expression EXP that calls built-in function FCODE, with result going to TARGET if that's convenient. IGNORE is true if the result of the builtin is ignored. */ @@ -2048,26 +2060,26 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, switch (fcode) { case AARCH64_BUILTIN_GET_FPCR: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPCR, SImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpcrsi, + SImode, target); case AARCH64_BUILTIN_SET_FPCR: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPCR, SImode, exp); return target; case AARCH64_BUILTIN_GET_FPSR: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPSR, SImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpsrsi, + SImode, target); case AARCH64_BUILTIN_SET_FPSR: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPSR, SImode, exp); return target; case AARCH64_BUILTIN_GET_FPCR64: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPCR, DImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpcrdi, + DImode, target); case AARCH64_BUILTIN_SET_FPCR64: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPCR, DImode, exp); return target; case AARCH64_BUILTIN_GET_FPSR64: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPSR, DImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpsrdi, + DImode, target); case AARCH64_BUILTIN_SET_FPSR64: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPSR, DImode, exp); return target; @@ -2079,20 +2091,13 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, arg0 = CALL_EXPR_ARG (exp, 0); op0 = force_reg (Pmode, expand_normal (arg0)); - if (!target) - target = gen_reg_rtx (Pmode); - else - target = force_reg (Pmode, target); - - emit_move_insn (target, op0); - if (fcode == AARCH64_PAUTH_BUILTIN_XPACLRI) { rtx lr = gen_rtx_REG (Pmode, R30_REGNUM); icode = CODE_FOR_xpaclri; emit_move_insn (lr, op0); emit_insn (GEN_FCN (icode) ()); - emit_move_insn (target, lr); + return lr; } else { @@ -2122,20 +2127,18 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, emit_move_insn (x17_reg, op0); emit_move_insn (x16_reg, op1); emit_insn (GEN_FCN (icode) ()); - emit_move_insn (target, x17_reg); + return x17_reg; } - return target; - case AARCH64_JSCVT: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = force_reg (DFmode, expand_normal (arg0)); - if (!target) - target = gen_reg_rtx (SImode); - else - target = force_reg 
(SImode, target); - emit_insn (GEN_FCN (CODE_FOR_aarch64_fjcvtzs) (target, op0)); - return target; + { + expand_operand ops[2]; + create_output_operand (&ops[0], target, SImode); + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + create_input_operand (&ops[1], op0, DFmode); + expand_insn (CODE_FOR_aarch64_fjcvtzs, 2, ops); + return ops[0].value; + } case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index f30ff35..3aa13f6 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -103,8 +103,11 @@ AARCH64_CORE("cortex-a75", cortexa75, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, neoversen1, 0x41, 0xd0b, -1) AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, neoversen1, 0x41, 0xd0e, -1) AARCH64_CORE("cortex-a77", cortexa77, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, neoversen1, 0x41, 0xd0d, -1) +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd41, -1) +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd42, -1) AARCH64_CORE("cortex-a65", cortexa65, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd43, -1) +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd44, -1) AARCH64_CORE("ares", ares, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd4a, -1) @@ -133,11 +136,15 @@ AARCH64_CORE("thunderx3t110", thunderx3t110, thunderx3t110, 8_3A, AARCH64_FL_ /* ARMv8.4-A Architecture Processors. */ /* Arm ('A') cores. 
*/ -AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversen1, 0x41, 0xd40, -1) +AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) +AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) /* Qualcomm ('Q') cores. */ AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira, 0x51, 0xC01, -1) +/* Armv8.5-A Architecture Processors. */ +AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG, neoversen2, 0x41, 0xd49, -1) + /* ARMv8-A big.LITTLE implementations. */ AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1) diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 8257df9..ca08642 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -155,7 +155,7 @@ AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | \ AARCH64_OPT_EXTENSION("profile", AARCH64_FL_PROFILE, 0, 0, false, "") /* Enabling/Disabling "rng" only changes "rng". */ -AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "") +AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "rng") /* Enabling/Disabling "memtag" only changes "memtag". */ AARCH64_OPT_EXTENSION("memtag", AARCH64_FL_MEMTAG, 0, 0, false, "") diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index c7e828d..7a34c84 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -136,6 +136,25 @@ enum aarch64_addr_query_type { ADDR_QUERY_ANY }; +/* Enumerates values that can be arbitrarily mixed into a calculation + in order to make the result of the calculation unique to its use case. + + AARCH64_SALT_SSP_SET + AARCH64_SALT_SSP_TEST + Used when calculating the address of the stack protection canary value. + There is a separate value for setting and testing the canary, meaning + that these two operations produce unique addresses: they are different + from each other, and from all other address calculations. + + The main purpose of this is to prevent the SET address being spilled + to the stack and reloaded for the TEST, since that would give an + attacker the opportunity to change the address of the expected + canary value. */ +enum aarch64_salt_type { + AARCH64_SALT_SSP_SET, + AARCH64_SALT_SSP_TEST +}; + /* A set of tuning parameters contains references to size and time cost models and vectors for address cost calculations, register move costs and memory move costs. 
*/ @@ -608,9 +627,9 @@ opt_machine_mode aarch64_ptrue_all_mode (rtx); rtx aarch64_convert_sve_data_to_pred (rtx, machine_mode, rtx); rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); void aarch64_expand_mov_immediate (rtx, rtx); +rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type); rtx aarch64_ptrue_reg (machine_mode); rtx aarch64_pfalse_reg (machine_mode); -bool aarch64_sve_pred_dominates_p (rtx *, rtx); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode); diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index d1b2110..5bc596d 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -45,8 +45,8 @@ BUILTIN_VDC (COMBINE, combine, 0, ALL) VAR1 (COMBINEP, combine, 0, ALL, di) - BUILTIN_VB (BINOP, pmul, 0, ALL) - BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL) + BUILTIN_VB (BINOP, pmul, 0, NONE) + BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL) BUILTIN_VD_BHSI (BINOP, addp, 0, NONE) VAR1 (UNOP, addp, 0, NONE, di) @@ -70,26 +70,26 @@ BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL) /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>. */ - BUILTIN_VDC (GETREG, get_dregoi, 0, ALL) - BUILTIN_VDC (GETREG, get_dregci, 0, ALL) - BUILTIN_VDC (GETREG, get_dregxi, 0, ALL) - VAR1 (GETREGP, get_dregoi, 0, ALL, di) - VAR1 (GETREGP, get_dregci, 0, ALL, di) - VAR1 (GETREGP, get_dregxi, 0, ALL, di) + BUILTIN_VDC (GETREG, get_dregoi, 0, AUTO_FP) + BUILTIN_VDC (GETREG, get_dregci, 0, AUTO_FP) + BUILTIN_VDC (GETREG, get_dregxi, 0, AUTO_FP) + VAR1 (GETREGP, get_dregoi, 0, AUTO_FP, di) + VAR1 (GETREGP, get_dregci, 0, AUTO_FP, di) + VAR1 (GETREGP, get_dregxi, 0, AUTO_FP, di) /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>. */ - BUILTIN_VQ (GETREG, get_qregoi, 0, ALL) - BUILTIN_VQ (GETREG, get_qregci, 0, ALL) - BUILTIN_VQ (GETREG, get_qregxi, 0, ALL) - VAR1 (GETREGP, get_qregoi, 0, ALL, v2di) - VAR1 (GETREGP, get_qregci, 0, ALL, v2di) - VAR1 (GETREGP, get_qregxi, 0, ALL, v2di) + BUILTIN_VQ (GETREG, get_qregoi, 0, AUTO_FP) + BUILTIN_VQ (GETREG, get_qregci, 0, AUTO_FP) + BUILTIN_VQ (GETREG, get_qregxi, 0, AUTO_FP) + VAR1 (GETREGP, get_qregoi, 0, AUTO_FP, v2di) + VAR1 (GETREGP, get_qregci, 0, AUTO_FP, v2di) + VAR1 (GETREGP, get_qregxi, 0, AUTO_FP, v2di) /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>. */ - BUILTIN_VQ (SETREG, set_qregoi, 0, ALL) - BUILTIN_VQ (SETREG, set_qregci, 0, ALL) - BUILTIN_VQ (SETREG, set_qregxi, 0, ALL) - VAR1 (SETREGP, set_qregoi, 0, ALL, v2di) - VAR1 (SETREGP, set_qregci, 0, ALL, v2di) - VAR1 (SETREGP, set_qregxi, 0, ALL, v2di) + BUILTIN_VQ (SETREG, set_qregoi, 0, AUTO_FP) + BUILTIN_VQ (SETREG, set_qregci, 0, AUTO_FP) + BUILTIN_VQ (SETREG, set_qregxi, 0, AUTO_FP) + VAR1 (SETREGP, set_qregoi, 0, AUTO_FP, v2di) + VAR1 (SETREGP, set_qregci, 0, AUTO_FP, v2di) + VAR1 (SETREGP, set_qregxi, 0, AUTO_FP, v2di) /* Implemented by aarch64_ld1x2<VQ:mode>. */ BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL) /* Implemented by aarch64_ld1x2<VDC:mode>. */ @@ -159,7 +159,7 @@ BUILTIN_VQN (TERNOP, raddhn2, 0, NONE) BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE) - BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL) + BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, ALL) /* Implemented by aarch64_<sur>qmovn<mode>. 
*/ BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL) BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, ALL) @@ -189,11 +189,11 @@ BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL) BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL) - BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL) - BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL) + BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, NONE) + BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, NONE) - BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL) - BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL) + BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, NONE) + BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, NONE) BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL) BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL) @@ -246,10 +246,10 @@ BUILTIN_VHSDF (BINOP, fcadd270, 0, FP) /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>. */ - BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL) - BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL) - BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL) - BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL) + BUILTIN_VHSDF (TERNOP, fcmla0, 0, FP) + BUILTIN_VHSDF (TERNOP, fcmla90, 0, FP) + BUILTIN_VHSDF (TERNOP, fcmla180, 0, FP) + BUILTIN_VHSDF (TERNOP, fcmla270, 0, FP) BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL) BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL) BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL) @@ -338,12 +338,11 @@ BUILTIN_VHSDF (UNOP, nearbyint, 2, FP) BUILTIN_VHSDF (UNOP, rint, 2, FP) BUILTIN_VHSDF (UNOP, round, 2, FP) - BUILTIN_VHSDF_DF (UNOP, frintn, 2, FP) + BUILTIN_VHSDF_HSDF (UNOP, frintn, 2, FP) VAR1 (UNOP, btrunc, 2, FP, hf) VAR1 (UNOP, ceil, 2, FP, hf) VAR1 (UNOP, floor, 2, FP, hf) - VAR1 (UNOP, frintn, 2, FP, hf) VAR1 (UNOP, nearbyint, 2, FP, hf) VAR1 (UNOP, rint, 2, FP, hf) VAR1 (UNOP, round, 2, FP, hf) @@ -535,8 +534,8 @@ VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si) /* Implemented by aarch64_crypto_pmull<mode>. */ - VAR1 (BINOPP, crypto_pmull, 0, ALL, di) - VAR1 (BINOPP, crypto_pmull, 0, ALL, v2di) + VAR1 (BINOPP, crypto_pmull, 0, NONE, di) + VAR1 (BINOPP, crypto_pmull, 0, NONE, v2di) /* Implemented by aarch64_tbl3<mode>. */ VAR1 (BINOP, tbl3, 0, ALL, v8qi) @@ -667,15 +666,15 @@ BUILTIN_VQ_I (TERNOP, bcaxq, 4, ALL) /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>. */ - VAR1 (TERNOP, fmlal_low, 0, ALL, v2sf) - VAR1 (TERNOP, fmlsl_low, 0, ALL, v2sf) - VAR1 (TERNOP, fmlalq_low, 0, ALL, v4sf) - VAR1 (TERNOP, fmlslq_low, 0, ALL, v4sf) + VAR1 (TERNOP, fmlal_low, 0, FP, v2sf) + VAR1 (TERNOP, fmlsl_low, 0, FP, v2sf) + VAR1 (TERNOP, fmlalq_low, 0, FP, v4sf) + VAR1 (TERNOP, fmlslq_low, 0, FP, v4sf) /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>. */ - VAR1 (TERNOP, fmlal_high, 0, ALL, v2sf) - VAR1 (TERNOP, fmlsl_high, 0, ALL, v2sf) - VAR1 (TERNOP, fmlalq_high, 0, ALL, v4sf) - VAR1 (TERNOP, fmlslq_high, 0, ALL, v4sf) + VAR1 (TERNOP, fmlal_high, 0, FP, v2sf) + VAR1 (TERNOP, fmlsl_high, 0, FP, v2sf) + VAR1 (TERNOP, fmlalq_high, 0, FP, v4sf) + VAR1 (TERNOP, fmlslq_high, 0, FP, v4sf) /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf. 
*/ VAR1 (QUADOP_LANE, fmlal_lane_low, 0, ALL, v2sf) VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, ALL, v2sf) @@ -713,20 +712,20 @@ VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf) /* Implemented by aarch64_bfmmlaqv4sf */ - VAR1 (TERNOP, bfmmlaq, 0, ALL, v4sf) + VAR1 (TERNOP, bfmmlaq, 0, AUTO_FP, v4sf) /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf */ - VAR1 (TERNOP, bfmlalb, 0, ALL, v4sf) - VAR1 (TERNOP, bfmlalt, 0, ALL, v4sf) + VAR1 (TERNOP, bfmlalb, 0, FP, v4sf) + VAR1 (TERNOP, bfmlalt, 0, FP, v4sf) VAR1 (QUADOP_LANE, bfmlalb_lane, 0, ALL, v4sf) VAR1 (QUADOP_LANE, bfmlalt_lane, 0, ALL, v4sf) VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf) VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf) /* Implemented by aarch64_simd_<sur>mmlav16qi. */ - VAR1 (TERNOP, simd_smmla, 0, ALL, v16qi) - VAR1 (TERNOPU, simd_ummla, 0, ALL, v16qi) - VAR1 (TERNOP_SSUS, simd_usmmla, 0, ALL, v16qi) + VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi) + VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi) + VAR1 (TERNOP_SSUS, simd_usmmla, 0, NONE, v16qi) /* Implemented by aarch64_bfcvtn{q}{2}<mode> */ VAR1 (UNOP, bfcvtn, 0, ALL, v4bf) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index cd79aba..31a8c5a 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -464,6 +464,95 @@ ;; ;; - MNEMONIC is the mnemonic of the associated SVE instruction. ;; +;; For (3) and (4), we combine these operations with an UNSPEC_SEL +;; that selects between the result of the FP operation and the "else" +;; value. (This else value is a merge input for _m ACLE functions +;; and zero for _z ACLE functions.) The outer pattern then has the form: +;; +;; (unspec [pred fp_operation else_value] UNSPEC_SEL) +;; +;; This means that the patterns for (3) and (4) have two predicates: +;; one for the FP operation itself and one for the UNSPEC_SEL. +;; This pattern is equivalent to the result of combining an instance +;; of (1) or (2) with a separate vcond instruction, so these patterns +;; are useful as combine targets too. +;; +;; However, in the combine case, the instructions that we want to +;; combine might use different predicates. Then: +;; +;; - Some of the active lanes of the FP operation might be discarded +;; by the UNSPEC_SEL. It's OK to drop the FP operation on those lanes, +;; even for SVE_STRICT_GP, since the operations on those lanes are +;; effectively dead code. +;; +;; - Some of the inactive lanes of the FP operation might be selected +;; by the UNSPEC_SEL, giving unspecified values for those lanes. +;; SVE_RELAXED_GP lets us extend the FP operation to cover these +;; extra lanes, but SVE_STRICT_GP does not. +;; +;; Thus SVE_RELAXED_GP allows us to ignore the predicate on the FP operation +;; and operate on exactly the lanes selected by the UNSPEC_SEL predicate. +;; This typically leads to patterns like: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (unspec [(match_operand N) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC>) +;; ...]) +;; +;; where operand N is allowed to be anything. These instructions then +;; have rewrite rules to replace operand N with operand 1, which gives the +;; instructions a canonical form and means that the original operand N is +;; not kept live unnecessarily. +;; +;; In contrast, SVE_STRICT_GP only allows the UNSPEC_SEL predicate to be +;; a subset of the FP operation predicate. This case isn't interesting +;; for FP operations that have an all-true predicate, since such operations +;; use SVE_RELAXED_GP instead. 
And it is not possible for instruction +;; conditions to track the subset relationship for arbitrary registers. +;; So in practice, the only useful case for SVE_STRICT_GP is the one +;; in which the predicates match: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (unspec [(match_dup 1) +;; (const_int SVE_STRICT_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC>) +;; ...]) +;; +;; This pattern would also be correct for SVE_RELAXED_GP, but it would +;; be redundant with the one above. However, if the combine pattern +;; has multiple FP operations, using a match_operand allows combinations +;; of SVE_STRICT_GP and SVE_RELAXED_GP in the same operation, provided +;; that the predicates are the same: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (... +;; (unspec [(match_dup 1) +;; (match_operand:SI N "aarch64_sve_gp_strictness") +;; ...] +;; UNSPEC_COND_<MNEMONIC1>) +;; (unspec [(match_dup 1) +;; (match_operand:SI M "aarch64_sve_gp_strictness") +;; ...] +;; UNSPEC_COND_<MNEMONIC2>) ...) +;; ...]) +;; +;; The fully-relaxed version of this pattern is: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (... +;; (unspec [(match_operand:SI N) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC1>) +;; (unspec [(match_operand:SI M) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC2>) ...) +;; ...]) +;; ;; ------------------------------------------------------------------------- ;; ---- Note on FFR handling ;; ------------------------------------------------------------------------- @@ -3304,18 +3393,18 @@ ) ;; Predicated floating-point unary arithmetic, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 3) - (match_operand:SI 4 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] SVE_COND_FP_UNARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[3], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" @@ -3326,6 +3415,24 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] + SVE_COND_FP_UNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point unary arithmetic, merging with an independent ;; value. ;; @@ -3334,20 +3441,18 @@ ;; which is handled above rather than here. Marking all the alternatives ;; as earlyclobber helps to make the instruction more regular to the ;; register allocator. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE_COND_FP_UNARY) (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> @@ -3359,6 +3464,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FP_UNARY) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Square root ;; ------------------------------------------------------------------------- @@ -4649,19 +4773,19 @@ ;; Predicated floating-point binary operations that take an integer as their ;; second operand, with inactive lanes coming from the first operand. 
-(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w")] SVE_COND_FP_BINARY_INT) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -4672,24 +4796,41 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY_INT) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point binary operations that take an integer as ;; their second operand, with the values of inactive lanes being distinct ;; from the other inputs. -(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w, w, w")] SVE_COND_FP_BINARY_INT) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> @@ -4713,6 +4854,35 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") + (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w, w, w")] + SVE_COND_FP_BINARY_INT) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, 
%0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] General binary arithmetic corresponding to rtx codes ;; ------------------------------------------------------------------------- @@ -4813,19 +4983,19 @@ ) ;; Predicated floating-point operations, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -4836,20 +5006,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Same for operations that take a 1-bit constant. 
-(define_insn_and_rewrite "*cond_<optab><mode>_2_const" +(define_insn_and_rewrite "*cond_<optab><mode>_2_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3" @@ -4860,20 +5049,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + SVE_COND_FP_BINARY_I1) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point operations, merging with the second input. -(define_insn_and_rewrite "*cond_<optab><mode>_3" +(define_insn_and_rewrite "*cond_<optab><mode>_3_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" @@ -4884,14 +5092,33 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_3_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + SVE_COND_FP_BINARY) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point operations, merging with an independent value. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] SVE_COND_FP_BINARY) @@ -4899,8 +5126,7 @@ UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4]) - && !rtx_equal_p (operands[3], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + && !rtx_equal_p (operands[3], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> @@ -4925,22 +5151,52 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + SVE_COND_FP_BINARY) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Same for operations that take a 1-bit constant. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any_const" +(define_insn_and_rewrite "*cond_<optab><mode>_any_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 @@ -4963,6 +5219,34 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + SVE_COND_FP_BINARY_I1) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Addition ;; ------------------------------------------------------------------------- @@ -5001,19 +5285,19 @@ ;; Predicated floating-point addition of a constant, merging with the ;; first input. 
-(define_insn_and_rewrite "*cond_add<mode>_2_const" +(define_insn_and_rewrite "*cond_add<mode>_2_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 @@ -5026,23 +5310,42 @@ [(set_attr "movprfx" "*,*,yes,yes")] ) +(define_insn "*cond_add<mode>_2_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + movprfx\t%0, %2\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0, %2\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3" + [(set_attr "movprfx" "*,*,yes,yes")] +) + ;; Predicated floating-point addition of a constant, merging with an ;; independent value. -(define_insn_and_rewrite "*cond_add<mode>_any_const" +(define_insn_and_rewrite "*cond_add<mode>_any_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] UNSPEC_COND_FADD) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 @@ -5068,6 +5371,37 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_add<mode>_any_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, 
%1/z, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + # + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Register merging forms are handled through SVE_COND_FP_BINARY. ;; ------------------------------------------------------------------------- @@ -5110,19 +5444,19 @@ ) ;; Predicated FCADD, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] SVE_COND_FCADD) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0, %2\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot>" @@ -5133,22 +5467,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FCADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0, %2\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated FCADD, merging with an independent value. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] SVE_COND_FCADD) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> @@ -5172,6 +5523,35 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] + SVE_COND_FCADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Subtraction ;; ------------------------------------------------------------------------- @@ -5209,19 +5589,19 @@ ;; Predicated floating-point subtraction from a constant, merging with the ;; second input. 
-(define_insn_and_rewrite "*cond_sub<mode>_3_const" +(define_insn_and_rewrite "*cond_sub<mode>_3_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 movprfx\t%0, %3\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2" @@ -5232,12 +5612,28 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_sub<mode>_3_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 + movprfx\t%0, %3\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point subtraction from a constant, merging with an ;; independent value. -;; -;; The subtraction predicate and the merge predicate are allowed to be -;; different. -(define_insn_and_rewrite "*cond_sub<mode>_relaxed_const" +(define_insn_and_rewrite "*cond_sub<mode>_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") @@ -5272,11 +5668,7 @@ [(set_attr "movprfx" "yes")] ) -;; Predicated floating-point subtraction from a constant, merging with an -;; independent value. -;; -;; The subtraction predicate and the merge predicate must be the same. -(define_insn_and_rewrite "*cond_sub<mode>_strict_const" +(define_insn_and_rewrite "*cond_sub<mode>_const_strict" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") @@ -5329,19 +5721,19 @@ ) ;; Predicated floating-point absolute difference. 
-(define_insn_and_rewrite "*aarch64_pred_abd<mode>" +(define_insn_and_rewrite "*aarch64_pred_abd<mode>_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (match_operand:SI 4 "aarch64_sve_gp_strictness") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -5352,6 +5744,25 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_pred_abd<mode>_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS))] + "TARGET_SVE" + "@ + fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + (define_expand "@aarch64_cond_abd<mode>" [(set (match_operand:SVE_FULL_F 0 "register_operand") (unspec:SVE_FULL_F @@ -5376,82 +5787,124 @@ ;; Predicated floating-point absolute difference, merging with the first ;; input. -(define_insn_and_rewrite "*aarch64_cond_abd<mode>_2" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + [(match_operand 5) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[4]) - || !rtx_equal_p (operands[1], operands[6]))" + || !rtx_equal_p (operands[1], operands[5]))" { operands[4] = copy_rtx (operands[1]); - operands[6] = copy_rtx (operands[1]); + operands[5] = copy_rtx (operands[1]); } [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_cond_abd<mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + 
fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point absolute difference, merging with the second ;; input. -(define_insn_and_rewrite "*aarch64_cond_abd<mode>_3" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_3_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + [(match_operand 5) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> movprfx\t%0, %3\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[4]) - || !rtx_equal_p (operands[1], operands[6]))" + || !rtx_equal_p (operands[1], operands[5]))" { operands[4] = copy_rtx (operands[1]); - operands[6] = copy_rtx (operands[1]); + operands[5] = copy_rtx (operands[1]); } [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_cond_abd<mode>_3_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0, %3\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point absolute difference, merging with an ;; independent value. 
-(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 7) - (match_operand:SI 8 "aarch64_sve_gp_strictness") + [(match_operand 6) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] UNSPEC_COND_FSUB)] @@ -5460,9 +5913,7 @@ UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4]) - && !rtx_equal_p (operands[3], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + && !rtx_equal_p (operands[3], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> @@ -5472,18 +5923,18 @@ "&& 1" { if (reload_completed - && register_operand (operands[4], <MODE>mode) - && !rtx_equal_p (operands[0], operands[4])) + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])) { emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[3], operands[4], operands[1])); operands[4] = operands[3] = operands[0]; } else if (!rtx_equal_p (operands[1], operands[5]) - || !rtx_equal_p (operands[1], operands[7])) + || !rtx_equal_p (operands[1], operands[6])) { operands[5] = copy_rtx (operands[1]); - operands[7] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); } else FAIL; @@ -5491,6 +5942,42 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[3], + operands[4], operands[1])); + operands[4] = operands[3] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Multiplication ;; ------------------------------------------------------------------------- @@ 
-6416,20 +6903,20 @@ ;; Predicated floating-point ternary operations, merging with the ;; first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ <sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> movprfx\t%0, %2\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>" @@ -6440,22 +6927,42 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] + SVE_COND_FP_TERNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> + movprfx\t%0, %2\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point ternary operations, merging with the ;; third input. -(define_insn_and_rewrite "*cond_<optab><mode>_4" +(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>" @@ -6466,15 +6973,35 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_4_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FP_TERNARY) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point ternary operations, merging with an ;; independent value. 
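As a hedged illustration of what "merging with an independent value" means at the source level, the C sketch below uses the SVE ACLE intrinsics from <arm_sve.h>: active lanes take the fused multiply-add result and inactive lanes take an unrelated vector, which is the UNSPEC_SEL wrapper the pattern below matches. The function and parameter names are invented for this example and are not part of the patch.

/* Hedged sketch, not part of the patch; requires an SVE-enabled compiler.  */
#include <arm_sve.h>

svfloat32_t
fmla_merge_independent (svbool_t pg, svfloat32_t acc, svfloat32_t x,
                        svfloat32_t y, svfloat32_t fallback)
{
  /* svmla_f32_x leaves inactive lanes unspecified (the relaxed case);
     the surrounding svsel supplies FALLBACK for those lanes.  */
  svfloat32_t mla = svmla_f32_x (pg, acc, x, y);
  return svsel_f32 (pg, mla, fallback);
}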
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] @@ -6484,8 +7011,7 @@ "TARGET_SVE && !rtx_equal_p (operands[2], operands[5]) && !rtx_equal_p (operands[3], operands[5]) - && !rtx_equal_p (operands[4], operands[5]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + && !rtx_equal_p (operands[4], operands[5])" "@ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> @@ -6511,6 +7037,41 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] + SVE_COND_FP_TERNARY) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[5]) + && !rtx_equal_p (operands[3], operands[5]) + && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[5], <MODE>mode) + && !rtx_equal_p (operands[0], operands[5])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Unpredicated FMLA and FMLS by selected lanes. It doesn't seem worth using ;; (fma ...) since target-independent code won't understand the indexing. (define_insn "@aarch64_<optab>_lane_<mode>" @@ -6572,20 +7133,20 @@ ) ;; Predicated FCMLA, merging with the third input. 
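The "_4" FCMLA patterns that follow merge with the accumulator, i.e. the third data input. A hedged intrinsics-level sketch of the same operation, with an invented function name; the rotation argument must be a constant of 0, 90, 180 or 270.

/* Hedged sketch, not part of the patch.  */
#include <arm_sve.h>

svfloat32_t
cmla_merge_accumulator (svbool_t pg, svfloat32_t acc,
                        svfloat32_t a, svfloat32_t b)
{
  /* The _m form keeps the accumulator in inactive lanes, which is what
     "merging with the third input" refers to.  */
  return svcmla_f32_m (pg, acc, a, b, 90);
}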
-(define_insn_and_rewrite "*cond_<optab><mode>_4" +(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] SVE_COND_FCMLA) (match_dup 4)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0, %4\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot>" @@ -6596,23 +7157,41 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_4_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FCMLA) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0, %4\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated FCMLA, merging with an independent value. -(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] SVE_COND_FCMLA) (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[4], operands[5]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[4], operands[5])" "@ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> @@ -6636,6 +7215,36 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] + SVE_COND_FCMLA) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, 
%3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + #" + "&& reload_completed + && register_operand (operands[5], <MODE>mode) + && !rtx_equal_p (operands[0], operands[5])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Unpredicated FCMLA with indexing. (define_insn "@aarch64_<optab>_lane_<mode>" [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w") @@ -7328,34 +7937,52 @@ "TARGET_SVE" ) -(define_insn_and_rewrite "*aarch64_pred_fac<cmp_op><mode>" +(define_insn_and_rewrite "*aarch64_pred_fac<cmp_op><mode>_relaxed" [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") (unspec:<VPRED> [(match_operand:<VPRED> 1 "register_operand" "Upl") (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w")] UNSPEC_COND_FABS) (unspec:SVE_FULL_F - [(match_operand 7) - (match_operand:SI 8 "aarch64_sve_gp_strictness") + [(match_operand 6) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 3 "register_operand" "w")] UNSPEC_COND_FABS)] SVE_COND_FP_ABS_CMP))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + "TARGET_SVE" "fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[5]) - || !rtx_equal_p (operands[1], operands[7]))" + || !rtx_equal_p (operands[1], operands[6]))" { operands[5] = copy_rtx (operands[1]); - operands[7] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); } ) +(define_insn "*aarch64_pred_fac<cmp_op><mode>_strict" + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + (unspec:<VPRED> + [(match_operand:<VPRED> 1 "register_operand" "Upl") + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 3 "register_operand" "w")] + UNSPEC_COND_FABS)] + SVE_COND_FP_ABS_CMP))] + "TARGET_SVE" + "fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>" +) + ;; ------------------------------------------------------------------------- ;; ---- [PRED] Select ;; ------------------------------------------------------------------------- @@ -7937,20 +8564,18 @@ ;; the same register (despite having different modes). Making all the ;; alternatives earlyclobber makes things more consistent for the ;; register allocator. 
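The conversion patterns below pair a predicated FCVTZS/FCVTZU with a merge value in operand 3. A hedged ACLE-level sketch of that shape for the float32-to-int32 case; the names are invented for the example.

/* Hedged sketch, not part of the patch.  */
#include <arm_sve.h>

svint32_t
fcvtz_merge (svbool_t pg, svfloat32_t x, svint32_t fallback)
{
  /* Inactive lanes keep FALLBACK, the role played by operand 3 in the
     cond_* conversion patterns.  */
  return svcvt_s32_f32_m (fallback, pg, x);
}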
-(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>" +(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_relaxed" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") (unspec:SVE_FULL_HSDI [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_HSDI [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE_COND_FCVTI) (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits> - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" "@ fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> @@ -7962,6 +8587,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_HSDI + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_HSDI + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FCVTI) + (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" + "@ + fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> + movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> + movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated narrowing float-to-integer conversion with merging. (define_expand "@cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand") @@ -8101,20 +8745,18 @@ ;; the same register (despite having different modes). Making all the ;; alternatives earlyclobber makes things more consistent for the ;; register allocator. 
-(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>" +(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] SVE_COND_ICVTF) (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits> - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" "@ <su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> @@ -8126,6 +8768,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] + SVE_COND_ICVTF) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" + "@ + <su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> + movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> + movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated widening integer-to-float conversion with merging. (define_expand "@cond_<optab>_extend<VNx4SI_ONLY:mode><VNx2DF_ONLY:mode>" [(set (match_operand:VNx2DF_ONLY 0 "register_operand") diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index e18b9fe..0cafd0b 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -1890,18 +1890,18 @@ ) ;; These instructions do not take MOVPRFX. 
-(define_insn_and_rewrite "*cond_<sve_fp_op><mode>" +(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_relaxed" [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") (unspec:SVE_FULL_SDF [(match_operand:<VPRED> 1 "register_operand" "Upl") (unspec:SVE_FULL_SDF [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:<VNARROW> 2 "register_operand" "w")] SVE2_COND_FP_UNARY_LONG) (match_operand:SVE_FULL_SDF 3 "register_operand" "0")] UNSPEC_SEL))] - "TARGET_SVE2 && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2" "<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Ventype>" "&& !rtx_equal_p (operands[1], operands[4])" { @@ -1909,6 +1909,21 @@ } ) +(define_insn "*cond_<sve_fp_op><mode>_strict" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") + (unspec:SVE_FULL_SDF + [(match_operand:<VPRED> 1 "register_operand" "Upl") + (unspec:SVE_FULL_SDF + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:<VNARROW> 2 "register_operand" "w")] + SVE2_COND_FP_UNARY_LONG) + (match_operand:SVE_FULL_SDF 3 "register_operand" "0")] + UNSPEC_SEL))] + "TARGET_SVE2" + "<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Ventype>" +) + ;; ------------------------------------------------------------------------- ;; ---- [FP<-FP] Narrowing conversions ;; ------------------------------------------------------------------------- @@ -1963,20 +1978,18 @@ "TARGET_SVE2" ) -(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_any" +(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_any_relaxed" [(set (match_operand:VNx4SF_ONLY 0 "register_operand" "=&w, &w, &w") (unspec:VNx4SF_ONLY [(match_operand:<VWIDE_PRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:VNx4SF_ONLY [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:<VWIDE> 2 "register_operand" "w, w, w")] SVE2_COND_FP_UNARY_NARROWB) (match_operand:VNx4SF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE2 - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> movprfx\t%0.<Vewtype>, %1/z, %2.<Vewtype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> @@ -1988,6 +2001,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<sve_fp_op><mode>_any_strict" + [(set (match_operand:VNx4SF_ONLY 0 "register_operand" "=&w, &w, &w") + (unspec:VNx4SF_ONLY + [(match_operand:<VWIDE_PRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:VNx4SF_ONLY + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:<VWIDE> 2 "register_operand" "w, w, w")] + SVE2_COND_FP_UNARY_NARROWB) + (match_operand:VNx4SF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> + movprfx\t%0.<Vewtype>, %1/z, %2.<Vewtype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated FCVTXNT. This doesn't give a natural aarch64_pred_*/cond_* ;; pair because the even elements always have to be supplied for active ;; elements, even if the inactive elements don't matter. 
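Common to the *_relaxed patterns in these hunks is the rewrite step that replaces the old aarch64_sve_pred_dominates_p insn condition: when a relaxed governing predicate has gone stale, it is re-pointed at the SEL predicate in operand 1. A hedged sketch of that step, assuming GCC's internal RTL API (rtx, rtx_equal_p, copy_rtx from rtl.h); the function name and parameters are invented.

/* Hedged sketch, not part of the patch; mirrors the "&& ..." rewrite
   bodies of the *_relaxed define_insn_and_rewrite patterns.  */
static bool
retarget_relaxed_gps (rtx *operands, const int *pred_idx, int n)
{
  bool changed = false;
  for (int i = 0; i < n; i++)
    if (!rtx_equal_p (operands[1], operands[pred_idx[i]]))
      {
        operands[pred_idx[i]] = copy_rtx (operands[1]);
        changed = true;
      }
  /* The real patterns either FAIL or leave the insn alone when nothing
     is stale, so a rewrite only happens when it is actually needed.  */
  return changed;
}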
@@ -2113,14 +2145,12 @@ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:<V_INT_EQUIV> [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE2_COND_INT_UNARY_FP) (match_operand:<V_INT_EQUIV> 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE2 - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> @@ -2132,6 +2162,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<sve_fp_op><mode>_strict" + [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:<V_INT_EQUIV> + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:<V_INT_EQUIV> + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE2_COND_INT_UNARY_FP) + (match_operand:<V_INT_EQUIV> 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [INT] Polynomial multiplication ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 0e3239c..e060302 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b251f39..a8cc545 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1336,6 +1336,58 @@ static const struct tune_params 
neoversen1_tunings = &generic_prefetch_tune }; +static const struct tune_params neoversev1_tunings = +{ + &cortexa57_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &cortexa57_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_256, /* sve_width */ + 4, /* memmov_cost */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + &generic_prefetch_tune +}; + +static const struct tune_params neoversen2_tunings = +{ + &cortexa57_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &cortexa57_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_128, /* sve_width */ + 4, /* memmov_cost */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + &generic_prefetch_tune +}; + static const struct tune_params a64fx_tunings = { &generic_extra_costs, @@ -1935,6 +1987,29 @@ aarch64_sve_abi (void) return sve_abi; } +/* If X is an UNSPEC_SALT_ADDR expression, return the address that it + wraps, otherwise return X itself. */ + +static rtx +strip_salt (rtx x) +{ + rtx search = x; + if (GET_CODE (search) == CONST) + search = XEXP (search, 0); + if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR) + x = XVECEXP (search, 0, 0); + return x; +} + +/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the + expression. */ + +static rtx +strip_offset_and_salt (rtx addr, poly_int64 *offset) +{ + return strip_salt (strip_offset (addr, offset)); +} + /* Generate code to enable conditional branches in functions over 1 MiB. */ const char * aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, @@ -2932,14 +3007,9 @@ static enum tls_model tls_symbolic_operand_type (rtx addr) { enum tls_model tls_kind = TLS_MODEL_NONE; - if (GET_CODE (addr) == CONST) - { - poly_int64 addend; - rtx sym = strip_offset (addr, &addend); - if (GET_CODE (sym) == SYMBOL_REF) - tls_kind = SYMBOL_REF_TLS_MODEL (sym); - } - else if (GET_CODE (addr) == SYMBOL_REF) + poly_int64 offset; + addr = strip_offset_and_salt (addr, &offset); + if (GET_CODE (addr) == SYMBOL_REF) tls_kind = SYMBOL_REF_TLS_MODEL (addr); return tls_kind; @@ -3404,11 +3474,16 @@ aarch64_split_128bit_move (rtx dst, rtx src) } } +/* Return true if we should split a move from 128-bit value SRC + to 128-bit register DEST. */ + bool aarch64_split_128bit_move_p (rtx dst, rtx src) { - return (! REG_P (src) - || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src)))); + if (FP_REGNUM_P (REGNO (dst))) + return REG_P (src) && !FP_REGNUM_P (REGNO (src)); + /* All moves to GPRs need to be split. */ + return true; } /* Split a complex SIMD combine. 
*/ @@ -3694,24 +3769,6 @@ aarch64_pfalse_reg (machine_mode mode) return gen_lowpart (mode, reg); } -/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is - true, or alternatively if we know that the operation predicated by - PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a - aarch64_sve_gp_strictness operand that describes the operation - predicated by PRED1[0]. */ - -bool -aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2) -{ - machine_mode mode = GET_MODE (pred2); - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL - && mode == GET_MODE (pred1[0]) - && aarch64_sve_gp_strictness (pred1[1], SImode)); - return (pred1[0] == CONSTM1_RTX (mode) - || INTVAL (pred1[1]) == SVE_RELAXED_GP - || rtx_equal_p (pred1[0], pred2)); -} - /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag for it. PRED2[0] is the predicate for the instruction whose result is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag @@ -5239,6 +5296,48 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) as_a <scalar_int_mode> (mode)); } +/* Return the MEM rtx that provides the canary value that should be used + for stack-smashing protection. MODE is the mode of the memory. + For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable + (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE + indicates whether the caller is performing a SET or a TEST operation. */ + +rtx +aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl, + aarch64_salt_type salt_type) +{ + rtx addr; + if (aarch64_stack_protector_guard == SSP_GLOBAL) + { + gcc_assert (MEM_P (decl_rtl)); + addr = XEXP (decl_rtl, 0); + poly_int64 offset; + rtx base = strip_offset_and_salt (addr, &offset); + if (!SYMBOL_REF_P (base)) + return decl_rtl; + + rtvec v = gen_rtvec (2, base, GEN_INT (salt_type)); + addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR); + addr = gen_rtx_CONST (Pmode, addr); + addr = plus_constant (Pmode, addr, offset); + } + else + { + /* Calculate the address from the system register. */ + rtx salt = GEN_INT (salt_type); + addr = gen_reg_rtx (mode); + if (mode == DImode) + emit_insn (gen_reg_stack_protect_address_di (addr, salt)); + else + { + emit_insn (gen_reg_stack_protect_address_si (addr, salt)); + addr = convert_memory_address (Pmode, addr); + } + addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset); + } + return gen_rtx_MEM (mode, force_reg (Pmode, addr)); +} + /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate that is known to contain PTRUE. */ @@ -8677,8 +8776,6 @@ aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode) static bool aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) { - rtx base, offset; - if (GET_CODE (x) == HIGH) return true; @@ -8688,10 +8785,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) if (GET_CODE (*iter) == CONST_POLY_INT) return true; - split_const (x, &base, &offset); + poly_int64 offset; + rtx base = strip_offset_and_salt (x, &offset); if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF) { - if (aarch64_classify_symbol (base, INTVAL (offset)) + /* We checked for POLY_INT_CST offsets above. 
*/ + if (aarch64_classify_symbol (base, offset.to_constant ()) != SYMBOL_FORCE_TO_MEM) return true; else @@ -9217,9 +9316,8 @@ aarch64_classify_address (struct aarch64_address_info *info, && GET_MODE_SIZE (mode).is_constant (&const_size) && const_size >= 4) { - rtx sym, addend; - - split_const (x, &sym, &addend); + poly_int64 offset; + rtx sym = strip_offset_and_salt (x, &offset); return ((GET_CODE (sym) == LABEL_REF || (GET_CODE (sym) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (sym) @@ -9234,10 +9332,12 @@ aarch64_classify_address (struct aarch64_address_info *info, if (allow_reg_index_p && aarch64_base_register_rtx_p (info->base, strict_p)) { - rtx sym, offs; - split_const (info->offset, &sym, &offs); + poly_int64 offset; + HOST_WIDE_INT const_offset; + rtx sym = strip_offset_and_salt (info->offset, &offset); if (GET_CODE (sym) == SYMBOL_REF - && (aarch64_classify_symbol (sym, INTVAL (offs)) + && offset.is_constant (&const_offset) + && (aarch64_classify_symbol (sym, const_offset) == SYMBOL_SMALL_ABSOLUTE)) { /* The symbol and offset must be aligned to the access size. */ @@ -9263,7 +9363,7 @@ aarch64_classify_address (struct aarch64_address_info *info, if (known_eq (ref_size, 0)) ref_size = GET_MODE_SIZE (DImode); - return (multiple_p (INTVAL (offs), ref_size) + return (multiple_p (const_offset, ref_size) && multiple_p (align / BITS_PER_UNIT, ref_size)); } } @@ -9295,9 +9395,8 @@ aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p) bool aarch64_symbolic_address_p (rtx x) { - rtx offset; - - split_const (x, &x, &offset); + poly_int64 offset; + x = strip_offset_and_salt (x, &offset); return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF; } @@ -10028,27 +10127,16 @@ aarch64_print_operand (FILE *f, rtx x, int code) switch (code) { case 'c': - switch (GET_CODE (x)) + if (CONST_INT_P (x)) + fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + else { - case CONST_INT: - fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - break; - - case SYMBOL_REF: - output_addr_const (f, x); - break; - - case CONST: - if (GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF) - { - output_addr_const (f, x); - break; - } - /* Fall through. */ - - default: - output_operand_lossage ("unsupported operand for code '%c'", code); + poly_int64 offset; + rtx base = strip_offset_and_salt (x, &offset); + if (SYMBOL_REF_P (base)) + output_addr_const (f, x); + else + output_operand_lossage ("unsupported operand for code '%c'", code); } break; @@ -10623,6 +10711,19 @@ aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x) output_addr_const (f, x); } +/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + +static bool +aarch64_output_addr_const_extra (FILE *file, rtx x) +{ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR) + { + output_addr_const (file, XVECEXP (x, 0, 0)); + return true; + } + return false; +} + bool aarch64_label_mentioned_p (rtx x) { @@ -15932,6 +16033,7 @@ aarch64_tls_symbol_p (rtx x) if (! 
TARGET_HAVE_TLS) return false; + x = strip_salt (x); if (GET_CODE (x) != SYMBOL_REF) return false; @@ -15987,6 +16089,8 @@ aarch64_classify_tls_symbol (rtx x) enum aarch64_symbol_type aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) { + x = strip_salt (x); + if (GET_CODE (x) == LABEL_REF) { switch (aarch64_cmodel) @@ -16086,11 +16190,10 @@ aarch64_constant_address_p (rtx x) bool aarch64_legitimate_pic_operand_p (rtx x) { - if (GET_CODE (x) == SYMBOL_REF - || (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) - return false; + poly_int64 offset; + x = strip_offset_and_salt (x, &offset); + if (GET_CODE (x) == SYMBOL_REF) + return false; return true; } @@ -16136,7 +16239,7 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) /* If an offset is being added to something else, we need to allow the base to be moved into the destination register, meaning that there are no free temporaries for the offset. */ - x = strip_offset (x, &offset); + x = strip_offset_and_salt (x, &offset); if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0) return false; @@ -18035,6 +18138,7 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) return aarch64_simd_valid_immediate (x, NULL); } + x = strip_salt (x); if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) return true; @@ -23890,6 +23994,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_PRINT_OPERAND_ADDRESS #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address +#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA +#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra + #undef TARGET_OPTAB_SUPPORTED_P #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dbc6b1d..78fe7c43 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -281,6 +281,7 @@ UNSPEC_GEN_TAG_RND ; Generate a random 4-bit MTE tag. UNSPEC_TAG_SPACE ; Translate address to MTE tag address space. 
UNSPEC_LD1RO + UNSPEC_SALT_ADDR ]) (define_c_enum "unspecv" [ @@ -1360,13 +1361,14 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 - "nonimmediate_operand" "= r,w, r,w,r,m,m,w,m") + "nonimmediate_operand" "= r,w,w, r,w,r,m,m,w,m") (match_operand:TI 1 - "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))] + "aarch64_movti_operand" " rUti,Z,r, w,w,m,r,Z,m,w"))] "(register_operand (operands[0], TImode) || aarch64_reg_or_zero (operands[1], TImode))" "@ # + movi\\t%0.2d, #0 # # mov\\t%0.16b, %1.16b @@ -1375,11 +1377,11 @@ stp\\txzr, xzr, %0 ldr\\t%q0, %1 str\\t%q1, %0" - [(set_attr "type" "multiple,f_mcr,f_mrc,neon_logic_q, \ + [(set_attr "type" "multiple,neon_move,f_mcr,f_mrc,neon_logic_q, \ load_16,store_16,store_16,\ load_16,store_16") - (set_attr "length" "8,8,8,4,4,4,4,4,4") - (set_attr "arch" "*,*,*,simd,*,*,*,fp,fp")] + (set_attr "length" "8,4,8,8,4,4,4,4,4,4") + (set_attr "arch" "*,simd,*,*,simd,*,*,*,fp,fp")] ) ;; Split a TImode register-register or register-immediate move into @@ -1510,9 +1512,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") + "nonimmediate_operand" "=w,?r ,w ,?r,w,?w,w,m,?r,m ,m") (match_operand:TF 1 - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] + "general_operand" " w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))] "TARGET_FLOAT && (register_operand (operands[0], TFmode) || aarch64_reg_or_fp_zero (operands[1], TFmode))" "@ @@ -1535,7 +1537,7 @@ (define_split [(set (match_operand:TF 0 "register_operand" "") - (match_operand:TF 1 "aarch64_reg_or_imm" ""))] + (match_operand:TF 1 "nonmemory_operand" ""))] "reload_completed && aarch64_split_128bit_move_p (operands[0], operands[1])" [(const_int 0)] { @@ -6881,43 +6883,37 @@ DONE; }) -;; Named patterns for stack smashing protection. +;; Defined for -mstack-protector-guard=sysreg, which goes through this +;; pattern rather than stack_protect_combined_set. Our implementation +;; of the latter can handle both. (define_expand "stack_protect_set" [(match_operand 0 "memory_operand") - (match_operand 1 "memory_operand")] + (match_operand 1 "")] "" { - machine_mode mode = GET_MODE (operands[0]); - if (aarch64_stack_protector_guard != SSP_GLOBAL) - { - /* Generate access through the system register. */ - rtx tmp_reg = gen_reg_rtx (mode); - if (mode == DImode) - { - emit_insn (gen_reg_stack_protect_address_di (tmp_reg)); - emit_insn (gen_adddi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); - } - else - { - emit_insn (gen_reg_stack_protect_address_si (tmp_reg)); - emit_insn (gen_addsi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); + emit_insn (gen_stack_protect_combined_set (operands[0], operands[1])); + DONE; +}) - } - operands[1] = gen_rtx_MEM (mode, tmp_reg); - } - +(define_expand "stack_protect_combined_set" + [(match_operand 0 "memory_operand") + (match_operand 1 "")] + "" +{ + machine_mode mode = GET_MODE (operands[0]); + operands[1] = aarch64_stack_protect_canary_mem (mode, operands[1], + AARCH64_SALT_SSP_SET); emit_insn ((mode == DImode ? gen_stack_protect_set_di : gen_stack_protect_set_si) (operands[0], operands[1])); DONE; }) +;; Operand 1 is either AARCH64_SALT_SSP_SET or AARCH64_SALT_SSP_TEST. 
(define_insn "reg_stack_protect_address_<mode>" [(set (match_operand:PTR 0 "register_operand" "=r") - (unspec:PTR [(const_int 0)] - UNSPEC_SSP_SYSREG))] + (unspec:PTR [(match_operand 1 "const_int_operand")] + UNSPEC_SSP_SYSREG))] "aarch64_stack_protector_guard != SSP_GLOBAL" { char buf[150]; @@ -6940,37 +6936,29 @@ [(set_attr "length" "12") (set_attr "type" "multiple")]) +;; Defined for -mstack-protector-guard=sysreg, which goes through this +;; pattern rather than stack_protect_combined_test. Our implementation +;; of the latter can handle both. (define_expand "stack_protect_test" [(match_operand 0 "memory_operand") - (match_operand 1 "memory_operand") + (match_operand 1 "") (match_operand 2)] "" { - machine_mode mode = GET_MODE (operands[0]); - - if (aarch64_stack_protector_guard != SSP_GLOBAL) - { - /* Generate access through the system register. The - sequence we want here is the access - of the stack offset to come with - mrs scratch_reg, <system_register> - add scratch_reg, scratch_reg, :lo12:offset. */ - rtx tmp_reg = gen_reg_rtx (mode); - if (mode == DImode) - { - emit_insn (gen_reg_stack_protect_address_di (tmp_reg)); - emit_insn (gen_adddi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); - } - else - { - emit_insn (gen_reg_stack_protect_address_si (tmp_reg)); - emit_insn (gen_addsi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); + emit_insn (gen_stack_protect_combined_test (operands[0], operands[1], + operands[2])); + DONE; +}) - } - operands[1] = gen_rtx_MEM (mode, tmp_reg); - } +(define_expand "stack_protect_combined_test" + [(match_operand 0 "memory_operand") + (match_operand 1 "") + (match_operand 2)] + "" +{ + machine_mode mode = GET_MODE (operands[0]); + operands[1] = aarch64_stack_protect_canary_mem (mode, operands[1], + AARCH64_SALT_SSP_TEST); emit_insn ((mode == DImode ? gen_stack_protect_test_di : gen_stack_protect_test_si) (operands[0], operands[1])); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 50f8b23..85c0d62 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6088,6 +6088,20 @@ vreinterpretq_u32_p128 (poly128_t __a) return (uint32x4_t)__a; } +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_p128 (poly128_t __a) +{ + return (float64x2_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f64 (float64x2_t __a) +{ + return (poly128_t) __a; +} + /* vset_lane */ __extension__ extern __inline float16x4_t @@ -12670,6 +12684,13 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) return (__a == __b); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return (__a == __b); +} + /* vceq - scalar. 
*/ __extension__ extern __inline uint32_t @@ -12779,6 +12800,13 @@ vceqz_u64 (uint64x1_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_p64 (poly64x1_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqzq_f32 (float32x4_t __a) @@ -12856,6 +12884,13 @@ vceqzq_u64 (uint64x2_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_p64 (poly64x2_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + /* vceqz - scalar. */ __extension__ extern __inline uint32_t @@ -14054,6 +14089,48 @@ vclsq_s32 (int32x4_t __a) return __builtin_aarch64_clrsbv4si (__a); } +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_clrsbv8qi ((int8x8_t) __a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u16 (uint16x4_t __a) +{ + return __builtin_aarch64_clrsbv4hi ((int16x4_t) __a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_clrsbv2si ((int32x2_t) __a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u8 (uint8x16_t __a) +{ + return __builtin_aarch64_clrsbv16qi ((int8x16_t) __a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u16 (uint16x8_t __a) +{ + return __builtin_aarch64_clrsbv8hi ((int16x8_t) __a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_clrsbv4si ((int32x4_t) __a); +} + /* vclz. 
*/ __extension__ extern __inline int8x8_t @@ -15538,7 +15615,7 @@ vdupq_n_f64 (float64_t __a) __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p8 (uint32_t __a) +vdupq_n_p8 (poly8_t __a) { return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15546,21 +15623,21 @@ vdupq_n_p8 (uint32_t __a) __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p16 (uint32_t __a) +vdupq_n_p16 (poly16_t __a) { return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p64 (uint64_t __a) +vdupq_n_p64 (poly64_t __a) { return (poly64x2_t) {__a, __a}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s8 (int32_t __a) +vdupq_n_s8 (int8_t __a) { return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15568,7 +15645,7 @@ vdupq_n_s8 (int32_t __a) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s16 (int32_t __a) +vdupq_n_s16 (int16_t __a) { return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -15589,7 +15666,7 @@ vdupq_n_s64 (int64_t __a) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u8 (uint32_t __a) +vdupq_n_u8 (uint8_t __a) { return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15597,7 +15674,7 @@ vdupq_n_u8 (uint32_t __a) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u16 (uint32_t __a) +vdupq_n_u16 (uint16_t __a) { return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -19613,6 +19690,13 @@ vld4q_p64 (const poly64_t * __a) return ret; } +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vldrq_p128 (const poly128_t * __ptr) +{ + return *__ptr; +} + /* vldn_dup */ __extension__ extern __inline int8x8x2_t @@ -23962,42 +24046,42 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s16 (int16x8_t __a) { - return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); + return __builtin_aarch64_sqmovunv8hi_us (__a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s32 (int32x4_t __a) { - return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); + return __builtin_aarch64_sqmovunv4si_us (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s64 (int64x2_t __a) { - return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); + return __builtin_aarch64_sqmovunv2di_us (__a); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovunh_s16 (int16_t __a) { - return (int8_t) __builtin_aarch64_sqmovunhi (__a); + return __builtin_aarch64_sqmovunhi_us (__a); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovuns_s32 (int32_t __a) { - return (int16_t) __builtin_aarch64_sqmovunsi 
(__a); + return __builtin_aarch64_sqmovunsi_us (__a); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovund_s64 (int64_t __a) { - return (int32_t) __builtin_aarch64_sqmovundi (__a); + return __builtin_aarch64_sqmovundi_us (__a); } /* vqneg */ @@ -24253,28 +24337,28 @@ vqrshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlb_u8 (uint8_t __a, uint8_t __b) +vqrshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqrshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlh_u16 (uint16_t __a, uint16_t __b) +vqrshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqrshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshls_u32 (uint32_t __a, uint32_t __b) +vqrshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqrshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshld_u64 (uint64_t __a, uint64_t __b) +vqrshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqrshldi_uus (__a, __b); } @@ -24553,28 +24637,28 @@ vqshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_u8 (uint8_t __a, uint8_t __b) +vqshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_u16 (uint16_t __a, uint16_t __b) +vqshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_u32 (uint32_t __a, uint32_t __b) +vqshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_u64 (uint64_t __a, uint64_t __b) +vqshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqshldi_uus (__a, __b); } @@ -26003,6 +26087,13 @@ vrndmq_f64 (float64x2_t __a) /* vrndn */ +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndns_f32 (float32_t __a) +{ + return __builtin_aarch64_frintnsf (__a); +} + __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndn_f32 (float32x2_t __a) @@ -26908,7 +26999,7 @@ vshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_u64 (uint64_t __a, uint64_t __b) +vshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_ushldi_uus (__a, __b); } @@ -30104,6 +30195,13 @@ vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vstrq_p128 (poly128_t * __ptr, poly128_t __val) +{ + *__ptr = __val; +} + /* vsub */ __extension__ extern __inline int64_t @@ -30491,6 +30589,17 @@ vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) #endif } 
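The arm_neon.h hunks around here add poly64x2_t variants of the permute intrinsics (vtrn1q_p64/vtrn2q_p64 and the uzp/zip counterparts). A hedged usage sketch with invented names:

/* Hedged usage sketch, not part of the patch.  */
#include <arm_neon.h>

void
trn_p64 (poly64x2_t a, poly64x2_t b, poly64x2_t *lo, poly64x2_t *hi)
{
  /* For two-element vectors, TRN1 pairs element 0 of each input and
     TRN2 pairs element 1, as the __builtin_shuffle masks encode.  */
  *lo = vtrn1q_p64 (a, b);
  *hi = vtrn2q_p64 (a, b);
}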
+__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) @@ -30761,6 +30870,18 @@ vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn_f16 (float16x4_t __a, float16x4_t __b) @@ -31407,6 +31528,17 @@ vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp2_f16 (float16x4_t __a, float16x4_t __b) @@ -31666,6 +31798,17 @@ vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (uzp) /* vzip */ @@ -31934,6 +32077,17 @@ vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip2_f16 (float16x4_t __a, float16x4_t __b) @@ -32198,6 +32352,17 @@ vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (zip) #undef __INTERLEAVE_LIST @@ -35659,6 +35824,55 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) #pragma GCC pop_options +__extension__ extern __inline poly8x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x4_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p16 (poly16x4_t __a, poly16x4_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly64x1_t +__attribute 
((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p64 (poly64x1_t __a, poly64x1_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly8x16_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + return __a ^__b; +} + +__extension__ extern __inline poly64x2_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly128_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p128 (poly128_t __a, poly128_t __b) +{ + return __a ^ __b; +} + #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 33e8015..db505a4 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -811,23 +811,23 @@ arm_ldrgbwbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_strsbwbs_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_unsigned, qualifier_const, qualifier_none}; + = { qualifier_unsigned, qualifier_unsigned, qualifier_const, qualifier_none}; #define STRSBWBS_QUALIFIERS (arm_strsbwbs_qualifiers) static enum arm_type_qualifiers arm_strsbwbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_unsigned, qualifier_const, qualifier_unsigned}; + = { qualifier_unsigned, qualifier_unsigned, qualifier_const, qualifier_unsigned}; #define STRSBWBU_QUALIFIERS (arm_strsbwbu_qualifiers) static enum arm_type_qualifiers arm_strsbwbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_unsigned, qualifier_const, + = { qualifier_unsigned, qualifier_unsigned, qualifier_const, qualifier_none, qualifier_unsigned}; #define STRSBWBS_P_QUALIFIERS (arm_strsbwbs_p_qualifiers) static enum arm_type_qualifiers arm_strsbwbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_unsigned, qualifier_const, + = { qualifier_unsigned, qualifier_unsigned, qualifier_const, qualifier_unsigned, qualifier_unsigned}; #define STRSBWBU_P_QUALIFIERS (arm_strsbwbu_p_qualifiers) diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index c98f8ed..8c61ad0 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -135,10 +135,6 @@ define feature armv8_1m_main # Floating point and Neon extensions. # VFPv1 is not supported in GCC. -# This feature bit is enabled for all VFP, MVE and -# MVE with floating point extensions. -define feature vfp_base - # Vector floating point v2. define feature vfpv2 @@ -251,7 +247,7 @@ define fgroup ALL_SIMD ALL_SIMD_INTERNAL ALL_SIMD_EXTERNAL # List of all FPU bits to strip out if -mfpu is used to override the # default. fp16 is deliberately missing from this list. -define fgroup ALL_FPU_INTERNAL vfp_base vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL +define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL # Similarly, but including fp16 and other extensions that aren't part of # -mfpu support. define fgroup ALL_FPU_EXTERNAL fp16 bf16 @@ -296,11 +292,11 @@ define fgroup ARMv8r ARMv8a define fgroup ARMv8_1m_main ARMv8m_main armv8_1m_main # Useful combinations. 
-define fgroup VFPv2 vfp_base vfpv2 +define fgroup VFPv2 vfpv2 define fgroup VFPv3 VFPv2 vfpv3 define fgroup VFPv4 VFPv3 vfpv4 fp16conv define fgroup FPv5 VFPv4 fpv5 -define fgroup MVE mve vfp_base armv7em +define fgroup MVE mve armv7em define fgroup MVE_FP MVE FPv5 fp16 mve_float define fgroup FP_DBL fp_dbl @@ -310,6 +306,18 @@ define fgroup NEON FP_D32 neon define fgroup CRYPTO NEON crypto define fgroup DOTPROD NEON dotprod +# Implied feature bits. These are for non-named features shared between fgroups. +# Shared feature f belonging to fgroups A and B will be erroneously removed if: +# A and B are enabled by default AND A is disabled by a removal flag. +# To ensure that f is retained, we must add such bits to the ISA after +# processing the removal flags. This is implemented by 'implied bits': +# define implied <name> [<feature-or-fgroup>]+ +# This indicates that, if any of the listed features are enabled, or if any +# member of a listed fgroup is enabled, then <name> will be implicitly enabled. + +# Enabled for all VFP, MVE and MVE with floating point extensions. +define implied vfp_base MVE MVE_FP ALL_FP + # List of all quirk bits to strip out when comparing CPU features with # architectures. # xscale isn't really a 'quirk', but it isn't an architecture either and we @@ -1447,6 +1455,39 @@ begin cpu cortex-a77 part d0d end cpu cortex-a77 +begin cpu cortex-a78 + cname cortexa78 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d41 +end cpu cortex-a78 + +begin cpu cortex-a78ae + cname cortexa78ae + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d42 +end cpu cortex-a78ae + +begin cpu cortex-x1 + cname cortexx1 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d44 +end cpu cortex-x1 + begin cpu neoverse-n1 cname neoversen1 alias !ares @@ -1478,6 +1519,30 @@ begin cpu cortex-a76.cortex-a55 costs cortex_a57 end cpu cortex-a76.cortex-a55 +# Armv8.4 A-profile Architecture Processors +begin cpu neoverse-v1 + cname neoversev1 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.4-a+fp16+bf16+i8mm + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part 0xd40 +end cpu neoverse-v1 + +# Armv8.5 A-profile Architecture Processors +begin cpu neoverse-n2 + cname neoversen2 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.5-a+fp16+bf16+i8mm + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part 0xd49 +end cpu neoverse-n2 + # V8 M-profile implementations. 
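The "define implied" rule above replaces the old vfp_base feature bit that previously lived inside the VFPv2 and MVE fgroups: removal options are processed first, and any shared bit whose antecedent feature is still enabled is then switched back on (see the ante/cons loop added to arm_configure_build_target further down in this patch). A simplified, self-contained model of that post-processing step; the bit numbers and the implications table below are illustrative stand-ins, not the generated all_implied_fbits data:

#include <stdint.h>
#include <stdio.h>

enum { FBIT_MVE, FBIT_MVE_FLOAT, FBIT_VFPV2, FBIT_VFP_BASE };

struct implication { int ante, cons; };

/* "define implied vfp_base MVE MVE_FP ALL_FP" expands to one entry per
   antecedent feature: if the antecedent survived the removal flags, the
   shared bit is re-enabled.  */
static const struct implication implications[] = {
  { FBIT_MVE,       FBIT_VFP_BASE },
  { FBIT_MVE_FLOAT, FBIT_VFP_BASE },
  { FBIT_VFPV2,     FBIT_VFP_BASE },
  { -1, -1 }
};

static uint64_t apply_implied_bits (uint64_t isa)
{
  for (const struct implication *i = implications; i->ante >= 0; i++)
    if (isa & (UINT64_C (1) << i->ante))
      isa |= UINT64_C (1) << i->cons;
  return isa;
}

int main (void)
{
  /* Illustrative scenario: a removal option stripped the FP features,
     but integer MVE remains, so vfp_base must come back.  */
  uint64_t isa = UINT64_C (1) << FBIT_MVE;
  isa = apply_implied_bits (isa);
  printf ("vfp_base %s\n",
          (isa & (UINT64_C (1) << FBIT_VFP_BASE)) ? "enabled" : "disabled");
  return 0;
}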
begin cpu cortex-m23 cname cortexm23 @@ -1508,6 +1573,10 @@ begin cpu cortex-m55 cname cortexm55 tune flags LDSCHED architecture armv8.1-m.main+mve.fp+fp.dp + option nomve.fp remove mve_float + option nomve remove mve mve_float + option nofp remove ALL_FP mve_float + option nodsp remove MVE mve_float isa quirk_no_asmcpu costs v7m vendor 41 diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 0cc0ae7..703d616 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -120,7 +120,6 @@ extern int arm_coproc_mem_operand_no_writeback (rtx); extern int arm_coproc_mem_operand_wb (rtx, int); extern int neon_vector_mem_operand (rtx, int, bool); extern int mve_vector_mem_operand (machine_mode, rtx, bool); -bool arm_mve_mode_and_operands_type_check (machine_mode, rtx, rtx); extern int neon_struct_mem_operand (rtx); extern rtx *neon_vcmla_lane_prepare_operands (rtx *); @@ -373,9 +372,11 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool); #endif /* RTX_CODE */ extern bool arm_gen_setmem (rtx *); +extern void arm_expand_vcond (rtx *, machine_mode); extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel); extern bool arm_autoinc_modes_ok_p (machine_mode, enum arm_auto_incmodes); diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt index ce35661..05f5c08 100644 --- a/gcc/config/arm/arm-tables.opt +++ b/gcc/config/arm/arm-tables.opt @@ -241,6 +241,15 @@ EnumValue Enum(processor_type) String(cortex-a77) Value( TARGET_CPU_cortexa77) EnumValue +Enum(processor_type) String(cortex-a78) Value( TARGET_CPU_cortexa78) + +EnumValue +Enum(processor_type) String(cortex-a78ae) Value( TARGET_CPU_cortexa78ae) + +EnumValue +Enum(processor_type) String(cortex-x1) Value( TARGET_CPU_cortexx1) + +EnumValue Enum(processor_type) String(neoverse-n1) Value( TARGET_CPU_neoversen1) EnumValue @@ -250,6 +259,12 @@ EnumValue Enum(processor_type) String(cortex-a76.cortex-a55) Value( TARGET_CPU_cortexa76cortexa55) EnumValue +Enum(processor_type) String(neoverse-v1) Value( TARGET_CPU_neoversev1) + +EnumValue +Enum(processor_type) String(neoverse-n2) Value( TARGET_CPU_neoversen2) + +EnumValue Enum(processor_type) String(cortex-m23) Value( TARGET_CPU_cortexm23) EnumValue diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md index 8ea9435..32657da 100644 --- a/gcc/config/arm/arm-tune.md +++ b/gcc/config/arm/arm-tune.md @@ -45,7 +45,9 @@ cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35, cortexa73cortexa53,cortexa55,cortexa75, cortexa76,cortexa76ae,cortexa77, + cortexa78,cortexa78ae,cortexx1, neoversen1,cortexa75cortexa55,cortexa76cortexa55, - cortexm23,cortexm33,cortexm35p, - cortexm55,cortexr52" + neoversev1,neoversen2,cortexm23, + cortexm33,cortexm35p,cortexm55, + cortexr52" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 022ef6c..dfadaca 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3391,6 +3391,20 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_ior (target->isa, target->isa, fpu_bits); } + /* There may be implied bits which we still need to enable. 
These are + non-named features which are needed to complete other sets of features, + but cannot be enabled from arm-cpus.in due to being shared between + multiple fgroups. Each entry in all_implied_fbits is of the form + ante -> cons, meaning that if the feature "ante" is enabled, we should + implicitly enable "cons". */ + const struct fbit_implication *impl = all_implied_fbits; + while (impl->ante) + { + if (bitmap_bit_p (target->isa, impl->ante)) + bitmap_set_bit (target->isa, impl->cons); + impl++; + } + if (!arm_selected_tune) arm_selected_tune = arm_selected_cpu; else /* Validate the features passed to -mtune. */ @@ -3415,8 +3429,9 @@ arm_option_override (void) { static const enum isa_feature fpu_bitlist_internal[] = { ISA_ALL_FPU_INTERNAL, isa_nobit }; + /* isa_bit_mve_float is also part of FP bit list for arch v8.1-m.main. */ static const enum isa_feature fp_bitlist[] - = { ISA_ALL_FP, isa_nobit }; + = { ISA_ALL_FP, isa_bit_mve_float, isa_nobit }; static const enum isa_feature quirk_bitlist[] = { ISA_ALL_QUIRKS, isa_nobit}; cl_target_option opts; @@ -13277,14 +13292,18 @@ arm_coproc_mem_operand_wb (rtx op, int wb_level) /* Match: (plus (reg) - (const)). */ + (const)) + + The encoded immediate for 16-bit modes is multiplied by 2, + while the encoded immediate for 32-bit and 64-bit modes is + multiplied by 4. */ + int factor = MIN (GET_MODE_SIZE (GET_MODE (op)), 4); if (GET_CODE (ind) == PLUS && REG_P (XEXP (ind, 0)) && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) && CONST_INT_P (XEXP (ind, 1)) - && INTVAL (XEXP (ind, 1)) > -1024 - && INTVAL (XEXP (ind, 1)) < 1024 - && (INTVAL (XEXP (ind, 1)) & 3) == 0) + && IN_RANGE (INTVAL (XEXP (ind, 1)), -255 * factor, 255 * factor) + && (INTVAL (XEXP (ind, 1)) & (factor - 1)) == 0) return TRUE; return FALSE; @@ -28946,6 +28965,30 @@ arm_preferred_simd_mode (scalar_mode mode) default:; } + if (TARGET_HAVE_MVE) + switch (mode) + { + case E_QImode: + return V16QImode; + case E_HImode: + return V8HImode; + case E_SImode: + return V4SImode; + + default:; + } + + if (TARGET_HAVE_MVE_FLOAT) + switch (mode) + { + case E_HFmode: + return V8HFmode; + case E_SFmode: + return V4SFmode; + + default:; + } + return word_mode; } @@ -30630,6 +30673,127 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, arm_post_atomic_barrier (model); } +/* Expand code to compare vectors OP0 and OP1 using condition CODE. + If CAN_INVERT, store either the result or its inverse in TARGET + and return true if TARGET contains the inverse. If !CAN_INVERT, + always store the result in TARGET, never its inverse. + + Note that the handling of floating-point comparisons is not + IEEE compliant. */ + +bool +arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, + bool can_invert) +{ + machine_mode cmp_result_mode = GET_MODE (target); + machine_mode cmp_mode = GET_MODE (op0); + + bool inverted; + switch (code) + { + /* For these we need to compute the inverse of the requested + comparison. */ + case UNORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + case NE: + code = reverse_condition_maybe_unordered (code); + if (!can_invert) + { + /* Recursively emit the inverted comparison into a temporary + and then store its inverse in TARGET. This avoids reusing + TARGET (which for integer NE could be one of the inputs). 
*/ + rtx tmp = gen_reg_rtx (cmp_result_mode); + if (arm_expand_vector_compare (tmp, code, op0, op1, true)) + gcc_unreachable (); + emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); + return false; + } + inverted = true; + break; + + default: + inverted = false; + break; + } + + switch (code) + { + /* These are natively supported for zero comparisons, but otherwise + require the operands to be swapped. */ + case LE: + case LT: + if (op1 != CONST0_RTX (cmp_mode)) + { + code = swap_condition (code); + std::swap (op0, op1); + } + /* Fall through. */ + + /* These are natively supported for both register and zero operands. */ + case EQ: + case GE: + case GT: + emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); + return inverted; + + /* These are natively supported for register operands only. + Comparisons with zero aren't useful and should be folded + or canonicalized by target-independent code. */ + case GEU: + case GTU: + emit_insn (gen_neon_vc (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); + return inverted; + + /* These require the operands to be swapped and likewise do not + support comparisons with zero. */ + case LEU: + case LTU: + emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, + target, force_reg (cmp_mode, op1), op0)); + return inverted; + + /* These need a combination of two comparisons. */ + case LTGT: + case ORDERED: + { + /* Operands are LTGT iff (a > b || a > b). + Operands are ORDERED iff (a > b || a <= b). */ + rtx gt_res = gen_reg_rtx (cmp_result_mode); + rtx alt_res = gen_reg_rtx (cmp_result_mode); + rtx_code alt_code = (code == LTGT ? LT : LE); + if (arm_expand_vector_compare (gt_res, GT, op0, op1, true) + || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true)) + gcc_unreachable (); + emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode, + gt_res, alt_res))); + return inverted; + } + + default: + gcc_unreachable (); + } +} + +/* Expand a vcond or vcondu pattern with operands OPERANDS. + CMP_RESULT_MODE is the mode of the comparison result. 
*/ + +void +arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) +{ + rtx mask = gen_reg_rtx (cmp_result_mode); + bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]), + operands[4], operands[5], true); + if (inverted) + std::swap (operands[1], operands[2]); + emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0], + mask, operands[1], operands[2])); +} + #define MAX_VECT_LEN 16 struct expand_vec_perm_d @@ -33112,9 +33276,7 @@ arm_expand_divmod_libfunc (rtx libfunc, machine_mode mode, = smallest_int_mode_for_size (2 * GET_MODE_BITSIZE (mode)); rtx libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST, - libval_mode, - op0, GET_MODE (op0), - op1, GET_MODE (op1)); + libval_mode, op0, mode, op1, mode); rtx quotient = simplify_gen_subreg (mode, libval, libval_mode, 0); rtx remainder = simplify_gen_subreg (mode, libval, libval_mode, @@ -33578,17 +33740,4 @@ arm_mode_base_reg_class (machine_mode mode) struct gcc_target targetm = TARGET_INITIALIZER; -bool -arm_mve_mode_and_operands_type_check (machine_mode mode, rtx op0, rtx op1) -{ - if (!(TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT)) - return true; - else if (mode == E_BFmode) - return false; - else if ((s_register_operand (op0, mode) && MEM_P (op1)) - || (s_register_operand (op1, mode) && MEM_P (op0))) - return false; - return true; -} - #include "gt-arm.h" diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index f4d3676..4a63d33 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1110,6 +1110,47 @@ extern const int arm_arch_cde_coproc_bits[]; #define VALID_MVE_STRUCT_MODE(MODE) \ ((MODE) == TImode || (MODE) == OImode || (MODE) == XImode) +/* The conditions under which vector modes are supported for general + arithmetic using Neon. */ + +#define ARM_HAVE_NEON_V8QI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V4HI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V2SI_ARITH TARGET_NEON + +#define ARM_HAVE_NEON_V16QI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V8HI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V4SI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V2DI_ARITH TARGET_NEON + +/* HF operations have their own flush-to-zero control (FPSCR.FZ16). */ +#define ARM_HAVE_NEON_V4HF_ARITH TARGET_NEON_FP16INST +#define ARM_HAVE_NEON_V8HF_ARITH TARGET_NEON_FP16INST + +/* SF operations always flush to zero, regardless of FPSCR.FZ, so we can + only use them for general arithmetic when -funsafe-math-optimizations + is in effect. */ +#define ARM_HAVE_NEON_V2SF_ARITH \ + (TARGET_NEON && flag_unsafe_math_optimizations) +#define ARM_HAVE_NEON_V4SF_ARITH ARM_HAVE_NEON_V2SF_ARITH + +/* The conditions under which vector modes are supported for general + arithmetic by any vector extension. 
*/ + +#define ARM_HAVE_V8QI_ARITH (ARM_HAVE_NEON_V8QI_ARITH || TARGET_REALLY_IWMMXT) +#define ARM_HAVE_V4HI_ARITH (ARM_HAVE_NEON_V4HI_ARITH || TARGET_REALLY_IWMMXT) +#define ARM_HAVE_V2SI_ARITH (ARM_HAVE_NEON_V2SI_ARITH || TARGET_REALLY_IWMMXT) + +#define ARM_HAVE_V16QI_ARITH (ARM_HAVE_NEON_V16QI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V8HI_ARITH (ARM_HAVE_NEON_V8HI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V4SI_ARITH (ARM_HAVE_NEON_V4SI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V2DI_ARITH ARM_HAVE_NEON_V2DI_ARITH + +#define ARM_HAVE_V4HF_ARITH ARM_HAVE_NEON_V4HF_ARITH +#define ARM_HAVE_V2SF_ARITH ARM_HAVE_NEON_V2SF_ARITH + +#define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || TARGET_HAVE_MVE_FLOAT) +#define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || TARGET_HAVE_MVE_FLOAT) + /* The register numbers in sequence, for passing to arm_gen_load_multiple. */ extern int arm_regs_in_sequence[]; diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index bffdb0b..1a8e498 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -7289,7 +7289,9 @@ (define_insn "*arm32_mov<mode>" [(set (match_operand:HFBF 0 "nonimmediate_operand" "=r,m,r,r") (match_operand:HFBF 1 "general_operand" " m,r,r,F"))] - "TARGET_32BIT && !TARGET_HARD_FLOAT + "TARGET_32BIT + && !TARGET_HARD_FLOAT + && !TARGET_HAVE_MVE && ( s_register_operand (operands[0], <MODE>mode) || s_register_operand (operands[1], <MODE>mode))" "* @@ -7355,7 +7357,7 @@ if (arm_disable_literal_pool && (REG_P (operands[0]) || SUBREG_P (operands[0])) && CONST_DOUBLE_P (operands[1]) - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !vfp3_const_double_rtx (operands[1])) { rtx clobreg = gen_reg_rtx (SFmode); @@ -7452,7 +7454,7 @@ if (arm_disable_literal_pool && (REG_P (operands[0]) || SUBREG_P (operands[0])) && CONSTANT_P (operands[1]) - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !arm_const_double_rtx (operands[1]) && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1]))) { @@ -9212,7 +9214,7 @@ operands[2] = operands[1]; else { - rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0); + rtx mem = force_const_mem (SImode, operands[1]); emit_move_insn (operands[2], mem); } } @@ -9295,7 +9297,7 @@ operands[3] = operands[1]; else { - rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0); + rtx mem = force_const_mem (SImode, operands[1]); emit_move_insn (operands[3], mem); } } diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index a801705..6c0d1e2 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -141,6 +141,7 @@ #define vrev64q_m(__inactive, __a, __p) __arm_vrev64q_m(__inactive, __a, __p) #define vqrdmlashq(__a, __b, __c) __arm_vqrdmlashq(__a, __b, __c) #define vqrdmlahq(__a, __b, __c) __arm_vqrdmlahq(__a, __b, __c) +#define vqdmlashq(__a, __b, __c) __arm_vqdmlashq(__a, __b, __c) #define vqdmlahq(__a, __b, __c) __arm_vqdmlahq(__a, __b, __c) #define vmvnq_m(__inactive, __a, __p) __arm_vmvnq_m(__inactive, __a, __p) #define vmlasq(__a, __b, __c) __arm_vmlasq(__a, __b, __c) @@ -260,6 +261,7 @@ #define vorrq_m(__inactive, __a, __b, __p) __arm_vorrq_m(__inactive, __a, __b, __p) #define vqaddq_m(__inactive, __a, __b, __p) __arm_vqaddq_m(__inactive, __a, __b, __p) #define vqdmladhq_m(__inactive, __a, __b, __p) __arm_vqdmladhq_m(__inactive, __a, __b, __p) +#define vqdmlashq_m(__a, __b, __c, __p) __arm_vqdmlashq_m(__a, __b, __c, __p) #define vqdmladhxq_m(__inactive, __a, __b, __p) __arm_vqdmladhxq_m(__inactive, __a, __b, __p) #define vqdmlahq_m(__a, __b, __c, __p) __arm_vqdmlahq_m(__a, __b, 
__c, __p) #define vqdmlsdhq_m(__inactive, __a, __b, __p) __arm_vqdmlsdhq_m(__inactive, __a, __b, __p) @@ -643,6 +645,7 @@ #define vcvtpq_u16_f16(__a) __arm_vcvtpq_u16_f16(__a) #define vcvtpq_u32_f32(__a) __arm_vcvtpq_u32_f32(__a) #define vcvtnq_u16_f16(__a) __arm_vcvtnq_u16_f16(__a) +#define vcvtnq_u32_f32(__a) __arm_vcvtnq_u32_f32(__a) #define vcvtmq_u16_f16(__a) __arm_vcvtmq_u16_f16(__a) #define vcvtmq_u32_f32(__a) __arm_vcvtmq_u32_f32(__a) #define vcvtaq_u16_f16(__a) __arm_vcvtaq_u16_f16(__a) @@ -1234,9 +1237,6 @@ #define vpselq_u8(__a, __b, __p) __arm_vpselq_u8(__a, __b, __p) #define vpselq_s8(__a, __b, __p) __arm_vpselq_s8(__a, __b, __p) #define vrev64q_m_u8(__inactive, __a, __p) __arm_vrev64q_m_u8(__inactive, __a, __p) -#define vqrdmlashq_n_u8(__a, __b, __c) __arm_vqrdmlashq_n_u8(__a, __b, __c) -#define vqrdmlahq_n_u8(__a, __b, __c) __arm_vqrdmlahq_n_u8(__a, __b, __c) -#define vqdmlahq_n_u8(__a, __b, __c) __arm_vqdmlahq_n_u8(__a, __b, __c) #define vmvnq_m_u8(__inactive, __a, __p) __arm_vmvnq_m_u8(__inactive, __a, __p) #define vmlasq_n_u8(__a, __b, __c) __arm_vmlasq_n_u8(__a, __b, __c) #define vmlaq_n_u8(__a, __b, __c) __arm_vmlaq_n_u8(__a, __b, __c) @@ -1306,6 +1306,7 @@ #define vqdmlsdhxq_s8(__inactive, __a, __b) __arm_vqdmlsdhxq_s8(__inactive, __a, __b) #define vqdmlsdhq_s8(__inactive, __a, __b) __arm_vqdmlsdhq_s8(__inactive, __a, __b) #define vqdmlahq_n_s8(__a, __b, __c) __arm_vqdmlahq_n_s8(__a, __b, __c) +#define vqdmlashq_n_s8(__a, __b, __c) __arm_vqdmlashq_n_s8(__a, __b, __c) #define vqdmladhxq_s8(__inactive, __a, __b) __arm_vqdmladhxq_s8(__inactive, __a, __b) #define vqdmladhq_s8(__inactive, __a, __b) __arm_vqdmladhq_s8(__inactive, __a, __b) #define vmlsdavaxq_s8(__a, __b, __c) __arm_vmlsdavaxq_s8(__a, __b, __c) @@ -1319,9 +1320,6 @@ #define vpselq_u16(__a, __b, __p) __arm_vpselq_u16(__a, __b, __p) #define vpselq_s16(__a, __b, __p) __arm_vpselq_s16(__a, __b, __p) #define vrev64q_m_u16(__inactive, __a, __p) __arm_vrev64q_m_u16(__inactive, __a, __p) -#define vqrdmlashq_n_u16(__a, __b, __c) __arm_vqrdmlashq_n_u16(__a, __b, __c) -#define vqrdmlahq_n_u16(__a, __b, __c) __arm_vqrdmlahq_n_u16(__a, __b, __c) -#define vqdmlahq_n_u16(__a, __b, __c) __arm_vqdmlahq_n_u16(__a, __b, __c) #define vmvnq_m_u16(__inactive, __a, __p) __arm_vmvnq_m_u16(__inactive, __a, __p) #define vmlasq_n_u16(__a, __b, __c) __arm_vmlasq_n_u16(__a, __b, __c) #define vmlaq_n_u16(__a, __b, __c) __arm_vmlaq_n_u16(__a, __b, __c) @@ -1390,6 +1388,7 @@ #define vqrdmladhq_s16(__inactive, __a, __b) __arm_vqrdmladhq_s16(__inactive, __a, __b) #define vqdmlsdhxq_s16(__inactive, __a, __b) __arm_vqdmlsdhxq_s16(__inactive, __a, __b) #define vqdmlsdhq_s16(__inactive, __a, __b) __arm_vqdmlsdhq_s16(__inactive, __a, __b) +#define vqdmlashq_n_s16(__a, __b, __c) __arm_vqdmlashq_n_s16(__a, __b, __c) #define vqdmlahq_n_s16(__a, __b, __c) __arm_vqdmlahq_n_s16(__a, __b, __c) #define vqdmladhxq_s16(__inactive, __a, __b) __arm_vqdmladhxq_s16(__inactive, __a, __b) #define vqdmladhq_s16(__inactive, __a, __b) __arm_vqdmladhq_s16(__inactive, __a, __b) @@ -1404,9 +1403,6 @@ #define vpselq_u32(__a, __b, __p) __arm_vpselq_u32(__a, __b, __p) #define vpselq_s32(__a, __b, __p) __arm_vpselq_s32(__a, __b, __p) #define vrev64q_m_u32(__inactive, __a, __p) __arm_vrev64q_m_u32(__inactive, __a, __p) -#define vqrdmlashq_n_u32(__a, __b, __c) __arm_vqrdmlashq_n_u32(__a, __b, __c) -#define vqrdmlahq_n_u32(__a, __b, __c) __arm_vqrdmlahq_n_u32(__a, __b, __c) -#define vqdmlahq_n_u32(__a, __b, __c) __arm_vqdmlahq_n_u32(__a, __b, __c) #define 
vmvnq_m_u32(__inactive, __a, __p) __arm_vmvnq_m_u32(__inactive, __a, __p) #define vmlasq_n_u32(__a, __b, __c) __arm_vmlasq_n_u32(__a, __b, __c) #define vmlaq_n_u32(__a, __b, __c) __arm_vmlaq_n_u32(__a, __b, __c) @@ -1475,6 +1471,7 @@ #define vqrdmladhq_s32(__inactive, __a, __b) __arm_vqrdmladhq_s32(__inactive, __a, __b) #define vqdmlsdhxq_s32(__inactive, __a, __b) __arm_vqdmlsdhxq_s32(__inactive, __a, __b) #define vqdmlsdhq_s32(__inactive, __a, __b) __arm_vqdmlsdhq_s32(__inactive, __a, __b) +#define vqdmlashq_n_s32(__a, __b, __c) __arm_vqdmlashq_n_s32(__a, __b, __c) #define vqdmlahq_n_s32(__a, __b, __c) __arm_vqdmlahq_n_s32(__a, __b, __c) #define vqdmladhxq_s32(__inactive, __a, __b) __arm_vqdmladhxq_s32(__inactive, __a, __b) #define vqdmladhq_s32(__inactive, __a, __b) __arm_vqdmladhq_s32(__inactive, __a, __b) @@ -1901,6 +1898,9 @@ #define vqdmladhxq_m_s8(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s8(__inactive, __a, __b, __p) #define vqdmladhxq_m_s32(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s32(__inactive, __a, __b, __p) #define vqdmladhxq_m_s16(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s16(__inactive, __a, __b, __p) +#define vqdmlashq_m_n_s8(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s8(__a, __b, __c, __p) +#define vqdmlashq_m_n_s32(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s32(__a, __b, __c, __p) +#define vqdmlashq_m_n_s16(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s16(__a, __b, __c, __p) #define vqdmlahq_m_n_s8(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s8(__a, __b, __c, __p) #define vqdmlahq_m_n_s32(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s32(__a, __b, __c, __p) #define vqdmlahq_m_n_s16(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s16(__a, __b, __c, __p) @@ -2024,8 +2024,6 @@ #define vmlaldavaq_p_u16(__a, __b, __c, __p) __arm_vmlaldavaq_p_u16(__a, __b, __c, __p) #define vmlaldavaxq_p_s32(__a, __b, __c, __p) __arm_vmlaldavaxq_p_s32(__a, __b, __c, __p) #define vmlaldavaxq_p_s16(__a, __b, __c, __p) __arm_vmlaldavaxq_p_s16(__a, __b, __c, __p) -#define vmlaldavaxq_p_u32(__a, __b, __c, __p) __arm_vmlaldavaxq_p_u32(__a, __b, __c, __p) -#define vmlaldavaxq_p_u16(__a, __b, __c, __p) __arm_vmlaldavaxq_p_u16(__a, __b, __c, __p) #define vmlsldavaq_p_s32(__a, __b, __c, __p) __arm_vmlsldavaq_p_s32(__a, __b, __c, __p) #define vmlsldavaq_p_s16(__a, __b, __c, __p) __arm_vmlsldavaq_p_s16(__a, __b, __c, __p) #define vmlsldavaxq_p_s32(__a, __b, __c, __p) __arm_vmlsldavaxq_p_s32(__a, __b, __c, __p) @@ -6961,27 +6959,6 @@ __arm_vrev64q_m_u8 (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u8 (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv16qi (__inactive, __a, __p); @@ -7424,6 +7401,13 @@ __arm_vqrdmlashq_n_s8 
(int8x16_t __a, int8x16_t __b, int8_t __c) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c) +{ + return __builtin_mve_vqdmlashq_n_sv16qi (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c) { return __builtin_mve_vqrdmlahq_n_sv16qi (__a, __b, __c); @@ -7557,27 +7541,6 @@ __arm_vrev64q_m_u16 (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u16 (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv8hi (__inactive, __a, __p); @@ -8019,6 +7982,13 @@ __arm_vqrdmlashq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + return __builtin_mve_vqdmlashq_n_sv8hi (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { return __builtin_mve_vqrdmlahq_n_sv8hi (__a, __b, __c); @@ -8152,27 +8122,6 @@ __arm_vrev64q_m_u32 (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv4si (__inactive, __a, __p); @@ -8614,6 +8563,13 @@ __arm_vqrdmlashq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return 
__builtin_mve_vqdmlashq_n_sv4si (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { return __builtin_mve_vqrdmlahq_n_sv4si (__a, __b, __c); @@ -11141,6 +11097,27 @@ __arm_vqrdmlashq_m_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv16qi (__a, __b, __c, __p); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv8hi (__a, __b, __c, __p); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv4si (__a, __b, __c, __p); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlsdhq_m_s8 (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) { return __builtin_mve_vqrdmlsdhq_m_sv16qi (__inactive, __a, __b, __p); @@ -11811,20 +11788,6 @@ __arm_vmlaldavaxq_p_s16 (int64_t __a, int16x8_t __b, int16x8_t __c, mve_pred16_t return __builtin_mve_vmlaldavaxq_p_sv8hi (__a, __b, __c, __p); } -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p_u32 (uint64_t __a, uint32x4_t __b, uint32x4_t __c, mve_pred16_t __p) -{ - return __builtin_mve_vmlaldavaxq_p_uv4si (__a, __b, __c, __p); -} - -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p_u16 (uint64_t __a, uint16x8_t __b, uint16x8_t __c, mve_pred16_t __p) -{ - return __builtin_mve_vmlaldavaxq_p_uv8hi (__a, __b, __c, __p); -} - __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmlsldavaq_p_s32 (int64_t __a, int32x4_t __b, int32x4_t __c, mve_pred16_t __p) @@ -13993,64 +13956,56 @@ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_wb_s64 (uint64x2_t * __addr, const int __offset, int64x2_t __value) { - __builtin_mve_vstrdq_scatter_base_wb_sv2di (*__addr, __offset, __value); - __builtin_mve_vstrdq_scatter_base_wb_add_sv2di (*__addr, __offset, *__addr); + *__addr = __builtin_mve_vstrdq_scatter_base_wb_sv2di (*__addr, __offset, __value); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_wb_u64 (uint64x2_t * __addr, const int __offset, uint64x2_t __value) { - __builtin_mve_vstrdq_scatter_base_wb_uv2di (*__addr, __offset, __value); - __builtin_mve_vstrdq_scatter_base_wb_add_uv2di (*__addr, __offset, *__addr); + *__addr = __builtin_mve_vstrdq_scatter_base_wb_uv2di (*__addr, __offset, __value); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_wb_p_s64 (uint64x2_t * __addr, const int __offset, int64x2_t __value, mve_pred16_t __p) { - __builtin_mve_vstrdq_scatter_base_wb_p_sv2di 
(*__addr, __offset, __value, __p); - __builtin_mve_vstrdq_scatter_base_wb_p_add_sv2di (*__addr, __offset, *__addr, __p); + *__addr = __builtin_mve_vstrdq_scatter_base_wb_p_sv2di (*__addr, __offset, __value, __p); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_wb_p_u64 (uint64x2_t * __addr, const int __offset, uint64x2_t __value, mve_pred16_t __p) { - __builtin_mve_vstrdq_scatter_base_wb_p_uv2di (*__addr, __offset, __value, __p); - __builtin_mve_vstrdq_scatter_base_wb_p_add_uv2di (*__addr, __offset, *__addr, __p); + *__addr = __builtin_mve_vstrdq_scatter_base_wb_p_uv2di (*__addr, __offset, __value, __p); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_p_s32 (uint32x4_t * __addr, const int __offset, int32x4_t __value, mve_pred16_t __p) { - __builtin_mve_vstrwq_scatter_base_wb_p_sv4si (*__addr, __offset, __value, __p); - __builtin_mve_vstrwq_scatter_base_wb_p_add_sv4si (*__addr, __offset, *__addr, __p); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_p_sv4si (*__addr, __offset, __value, __p); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_p_u32 (uint32x4_t * __addr, const int __offset, uint32x4_t __value, mve_pred16_t __p) { - __builtin_mve_vstrwq_scatter_base_wb_p_uv4si (*__addr, __offset, __value, __p); - __builtin_mve_vstrwq_scatter_base_wb_p_add_uv4si (*__addr, __offset, *__addr, __p); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_p_uv4si (*__addr, __offset, __value, __p); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_s32 (uint32x4_t * __addr, const int __offset, int32x4_t __value) { - __builtin_mve_vstrwq_scatter_base_wb_sv4si (*__addr, __offset, __value); - __builtin_mve_vstrwq_scatter_base_wb_add_sv4si (*__addr, __offset, *__addr); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_sv4si (*__addr, __offset, __value); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint32x4_t __value) { - __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); - __builtin_mve_vstrwq_scatter_base_wb_add_uv4si (*__addr, __offset, *__addr); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); } __extension__ extern __inline uint8x16_t @@ -17012,6 +16967,13 @@ __arm_vcvtnq_u16_f16 (float16x8_t __a) return __builtin_mve_vcvtnq_uv8hi (__a); } +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vcvtnq_u32_f32 (float32x4_t __a) +{ + return __builtin_mve_vcvtnq_uv4si (__a); +} + __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcvtmq_u16_f16 (float16x8_t __a) @@ -19158,16 +19120,14 @@ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_f32 (uint32x4_t * __addr, const int __offset, float32x4_t __value) { - __builtin_mve_vstrwq_scatter_base_wb_fv4sf (*__addr, __offset, __value); - __builtin_mve_vstrwq_scatter_base_wb_add_fv4sf (*__addr, __offset, *__addr); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_fv4sf (*__addr, __offset, __value); } __extension__ extern __inline void 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_wb_p_f32 (uint32x4_t * __addr, const int __offset, float32x4_t __value, mve_pred16_t __p) { - __builtin_mve_vstrwq_scatter_base_wb_p_fv4sf (*__addr, __offset, __value, __p); - __builtin_mve_vstrwq_scatter_base_wb_p_add_fv4sf (*__addr, __offset, *__addr, __p); + *__addr = __builtin_mve_vstrwq_scatter_base_wb_p_fv4sf (*__addr, __offset, __value, __p); } __extension__ extern __inline float16x8_t @@ -23742,27 +23702,6 @@ __arm_vrev64q_m (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqrdmlashq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqrdmlahq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqdmlahq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u8 (__inactive, __a, __p); @@ -24204,6 +24143,13 @@ __arm_vqrdmlashq (int8x16_t __a, int8x16_t __b, int8_t __c) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int8x16_t __a, int8x16_t __b, int8_t __c) +{ + return __arm_vqdmlashq_n_s8 (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int8x16_t __a, int8x16_t __b, int8_t __c) { return __arm_vqrdmlahq_n_s8 (__a, __b, __c); @@ -24337,27 +24283,6 @@ __arm_vrev64q_m (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqrdmlashq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqrdmlahq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqdmlahq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u16 (__inactive, __a, __p); @@ -24799,6 +24724,13 @@ __arm_vqrdmlashq (int16x8_t __a, int16x8_t __b, int16_t __c) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + return __arm_vqdmlashq_n_s16 (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int16x8_t __a, int16x8_t __b, int16_t __c) { return 
__arm_vqrdmlahq_n_s16 (__a, __b, __c); @@ -24932,27 +24864,6 @@ __arm_vrev64q_m (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqrdmlashq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqrdmlahq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqdmlahq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u32 (__inactive, __a, __p); @@ -25394,6 +25305,13 @@ __arm_vqrdmlashq (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return __arm_vqdmlashq_n_s32 (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int32x4_t __a, int32x4_t __b, int32_t __c) { return __arm_vqrdmlahq_n_s32 (__a, __b, __c); @@ -27921,6 +27839,27 @@ __arm_vqrdmlashq_m (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int8x16_t __a, int8x16_t __b, int8_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s8 (__a, __b, __c, __p); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s16 (__a, __b, __c, __p); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int32x4_t __a, int32x4_t __b, int32_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s32 (__a, __b, __c, __p); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlsdhq_m (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) { return __arm_vqrdmlsdhq_m_s8 (__inactive, __a, __b, __p); @@ -28591,20 +28530,6 @@ __arm_vmlaldavaxq_p (int64_t __a, int16x8_t __b, int16x8_t __c, mve_pred16_t __p return __arm_vmlaldavaxq_p_s16 (__a, __b, __c, __p); } -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p (uint64_t __a, uint32x4_t __b, uint32x4_t __c, mve_pred16_t __p) -{ - return __arm_vmlaldavaxq_p_u32 (__a, __b, __c, __p); -} - -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p (uint64_t __a, uint16x8_t __b, uint16x8_t __c, mve_pred16_t __p) -{ - return __arm_vmlaldavaxq_p_u16 (__a, __b, __c, __p); -} - __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmlsldavaq_p (int64_t __a, int32x4_t __b, 
int32x4_t __c, mve_pred16_t __p) @@ -35651,6 +35576,7 @@ enum { short: __ARM_mve_type_int_n, \ int: __ARM_mve_type_int_n, \ long: __ARM_mve_type_int_n, \ + double: __ARM_mve_type_fp_n, \ long long: __ARM_mve_type_int_n, \ unsigned char: __ARM_mve_type_int_n, \ unsigned short: __ARM_mve_type_int_n, \ @@ -35723,6 +35649,8 @@ extern void *__ARM_undef; _Generic(param, type: param, default: *(type *)__ARM_undef) #define __ARM_mve_coerce1(param, type) \ _Generic(param, type: param, const type: param, default: *(type *)__ARM_undef) +#define __ARM_mve_coerce2(param, type) \ + _Generic(param, type: param, float16_t: param, float32_t: param, default: *(type *)__ARM_undef) #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */ @@ -35939,14 +35867,14 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_f16 (__ARM_mve_coerce(p0, float16x8_t), __ARM_mve_coerce(p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_f32 (__ARM_mve_coerce(p0, float32x4_t), __ARM_mve_coerce(p1, float32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f32 
(__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -35997,8 +35925,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmulq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmulq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmulq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36029,8 +35957,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpeqq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpeqq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpeqq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36069,8 +35997,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t), p2), \ int 
(*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpeqq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpeqq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpgtq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36083,8 +36011,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgtq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgtq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpleq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36097,8 +36025,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpltq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36111,8 +36039,8 @@ extern void *__ARM_undef; int 
(*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpltq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpltq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpltq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36123,8 +36051,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36179,8 +36107,8 @@ extern void *__ARM_undef; #define __arm_vmaxnmavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_f32 (__ARM_mve_coerce2(__p0, double), 
__ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vmaxnmq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36191,14 +36119,14 @@ extern void *__ARM_undef; #define __arm_vmaxnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vmaxnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vminnmaq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36209,8 +36137,8 @@ extern void *__ARM_undef; #define __arm_vminnmavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vbrsrq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -36232,8 +36160,8 @@ extern void *__ARM_undef; #define __arm_vsubq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), 
__ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ @@ -36252,8 +36180,8 @@ extern void *__ARM_undef; #define __arm_vminnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vshlq_r(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -36782,10 +36710,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) + +#define __arm_vqdmlashq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ + int 
(*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmlahq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36793,10 +36726,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vmlasq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36815,10 +36745,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int 
(*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmladhxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37011,8 +36938,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgtq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgtq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) @@ -37027,8 +36954,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpltq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37041,8 +36968,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s16 
(__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpneq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37061,8 +36988,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t), p2), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t), p2), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcvtbq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37116,8 +37043,8 @@ extern void *__ARM_undef; __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vfmaq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t)), \ int 
(*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vfmaq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t)));}) @@ -37132,8 +37059,8 @@ extern void *__ARM_undef; __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double)));}) #define __arm_vmaxnmaq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37156,14 +37083,14 @@ extern void *__ARM_undef; #define __arm_vmaxnmavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vmaxnmvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vminnmaq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37174,14 +37101,14 @@ extern void *__ARM_undef; #define __arm_vminnmavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int 
(*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vminnmvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vrndnq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37248,8 +37175,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgeq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgeq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vrshrnbq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37353,8 +37280,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f16 
(__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgeq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgeq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) @@ -37389,8 +37316,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vandq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37531,15 +37458,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vfmaq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vfmaq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int 
(*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vfmasq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vfmsq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37580,8 +37507,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vornq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37614,8 +37541,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), 
__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vorrq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -38113,8 +38040,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vandq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -38248,8 +38175,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vmulq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vmulq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, 
double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vnegq_x(p1,p2) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ @@ -38337,8 +38264,8 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vcmulq_rot90_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -38370,8 +38297,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vsetq_lane_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vsetq_lane_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint64x2_t]: __arm_vsetq_lane_u64 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint64x2_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vsetq_lane_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vsetq_lane_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vsetq_lane_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vsetq_lane_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #else /* MVE Integer. 
*/ @@ -38895,12 +38822,12 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)));}) + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)));}) #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39254,10 +39181,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, 
uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) + +#define __arm_vqdmlashq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmlahq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39265,10 +39197,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmladhxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39399,10 +39328,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ 
int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqdmlsdhq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -40800,6 +40726,14 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t), p3), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t), p3));}) +#define __arm_vqdmlashq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t), p3), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t), p3), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t), p3));}) + #define __arm_vqrshlq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41057,9 +40991,7 @@ extern void *__ARM_undef; __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlaldavaxq_p_s16 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlaldavaxq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int 
(*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmlaldavaxq_p_u16 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmlaldavaxq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlaldavaxq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) #define __arm_vmlsldavaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -41679,16 +41611,16 @@ extern void *__ARM_undef; #define __arm_vmaxavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)));}) #define __arm_vmaxavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_p_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_p_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_p_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) #define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41703,36 +41635,36 @@ extern void *__ARM_undef; #define __arm_vmaxvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - 
int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_u32 (__p0,__ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vmaxvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_p_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_p_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_p_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_p_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_p_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_p_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_p_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_p_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_p_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t), p2));}) #define __arm_vminavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int 
(*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)));}) #define __arm_vminavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_p_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_p_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_p_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) #define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41747,22 +41679,22 @@ extern void *__ARM_undef; #define __arm_vminvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vminvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ 
_Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_p_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_p_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_p_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_p_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_p_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
- int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_p_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2), \
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_p_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t), p2), \
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_p_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t), p2), \
+ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_p_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t), p2));})

#define __arm_vmladavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
__typeof(p1) __p1 = (p1); \
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index 753e40a..f38926f 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -312,9 +312,6 @@ VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si)
VAR4 (TERNOP_UNONE_UNONE_UNONE_UNONE, vpselq_u, v16qi, v8hi, v4si, v2di)
VAR4 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_s, v16qi, v8hi, v4si, v2di)
VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si)
-VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqrdmlashq_n_u, v16qi, v8hi, v4si)
-VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqrdmlahq_n_u, v16qi, v8hi, v4si)
-VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqdmlahq_n_u, v16qi, v8hi, v4si)
VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si)
VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si)
VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaq_n_u, v16qi, v8hi, v4si)
@@ -384,6 +381,7 @@ VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmladhq_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlsdhxq_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlsdhq_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlahq_n_s, v16qi, v8hi, v4si)
+VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlashq_n_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmladhxq_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmladhq_s, v16qi, v8hi, v4si)
VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmlsdavaxq_s, v16qi, v8hi, v4si)
@@ -574,6 +572,7 @@ VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_n_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhxq_m_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhq_m_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlahq_m_n_s, v16qi, v8hi, v4si)
+VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlashq_m_n_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhxq_m_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhq_m_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_s, v16qi, v8hi, v4si)
@@ -615,7 +614,6 @@ VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrq_m_n_s, v16qi, v8hi, v4si)
VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshlq_m_n_s, v16qi, v8hi, v4si)
VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_poly_m_p, v16qi, v8hi)
VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_poly_m_p, v16qi, v8hi)
-VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaxq_p_u, v8hi, v4si)
VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_p_u, v8hi, v4si)
VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrntq_m_n_u, v8hi, v4si)
VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrnbq_m_n_u, v8hi, v4si)
@@ -828,19 +826,9 @@ VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vidupq_m_n_u, v16qi, v8hi, v4si)
VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_n_u, v16qi, v4si, v8hi)
VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_n_u, v16qi, v4si, v8hi)
VAR1 (STRSBWBU, vstrwq_scatter_base_wb_u, v4si)
-VAR1 (STRSBWBU, vstrwq_scatter_base_wb_add_u, v4si)
-VAR1 (STRSBWBU, vstrwq_scatter_base_wb_add_s, v4si)
-VAR1 (STRSBWBU, vstrwq_scatter_base_wb_add_f, v4sf)
VAR1 (STRSBWBU, vstrdq_scatter_base_wb_u, v2di)
-VAR1 (STRSBWBU, vstrdq_scatter_base_wb_add_u, v2di)
-VAR1 (STRSBWBU, vstrdq_scatter_base_wb_add_s, v2di)
VAR1 (STRSBWBU_P, vstrwq_scatter_base_wb_p_u, v4si)
-VAR1 (STRSBWBU_P, vstrwq_scatter_base_wb_p_add_u, v4si)
-VAR1 (STRSBWBU_P, vstrwq_scatter_base_wb_p_add_s, v4si)
-VAR1 (STRSBWBU_P, vstrwq_scatter_base_wb_p_add_f, v4sf)
VAR1 (STRSBWBU_P, vstrdq_scatter_base_wb_p_u, v2di)
-VAR1 (STRSBWBU_P, vstrdq_scatter_base_wb_p_add_u, v2di)
-VAR1 (STRSBWBU_P, vstrdq_scatter_base_wb_p_add_s, v2di)
VAR1 (STRSBWBS, vstrwq_scatter_base_wb_s, v4si)
VAR1 (STRSBWBS, vstrwq_scatter_base_wb_f, v4sf)
VAR1 (STRSBWBS, vstrdq_scatter_base_wb_s, v2di)
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index ff229aa..789e333 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -454,10 +454,13 @@
(define_memory_constraint "Uj"
"@internal
- In ARM/Thumb-2 state an VFP load/store address which does not support
- writeback at all (eg vldr.16)."
+ In ARM/Thumb-2 state a VFP load/store address that supports writeback
+ for Neon but not for MVE"
(and (match_code "mem")
- (match_test "TARGET_32BIT && arm_coproc_mem_operand_no_writeback (op)")))
+ (match_test "TARGET_32BIT")
+ (match_test "TARGET_HAVE_MVE
+ ? arm_coproc_mem_operand_no_writeback (op)
+ : neon_vector_mem_operand (op, 2, true)")))

(define_memory_constraint "Uy"
"@internal
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 0bc9eba..f934872 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -66,14 +66,6 @@
;; Integer and float modes supported by Neon and IWMMXT.
(define_mode_iterator VALL [V2DI V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF])

-;; Integer and float modes supported by Neon, IWMMXT and MVE, used by
-;; arithmetic epxand patterns.
-(define_mode_iterator VNIM [V16QI V8HI V4SI V4SF])
-
-;; Integer and float modes supported by Neon and IWMMXT but not MVE, used by
-;; arithmetic epxand patterns.
-(define_mode_iterator VNINOTM [V2SI V4HI V8QI V2SF V2DI])
-
;; Integer and float modes supported by Neon, IWMMXT and MVE.
(define_mode_iterator VNIM1 [V16QI V8HI V4SI V4SF V2DI])
@@ -267,6 +259,16 @@
(define_mode_iterator VBFCVT [V4BF V8BF])
(define_mode_iterator VBFCVTM [V2SI SF])

+;; MVE mode iterator.
+(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
+(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF])
+(define_mode_iterator MVE_0 [V8HF V4SF])
+(define_mode_iterator MVE_1 [V16QI V8HI V4SI V2DI])
+(define_mode_iterator MVE_3 [V16QI V8HI])
+(define_mode_iterator MVE_2 [V16QI V8HI V4SI])
+(define_mode_iterator MVE_5 [V8HI V4SI])
+(define_mode_iterator MVE_6 [V8HI V4SI])
+
;;----------------------------------------------------------------------------
;; Code iterators
;;----------------------------------------------------------------------------
@@ -901,6 +903,35 @@
(define_mode_attr cde_suffix [(SI "") (DI "d")])
(define_mode_attr cde_dest [(SI "%0") (DI "%0, %H0")])

+;;MVE mode attribute.
+(define_mode_attr MVE_CNVT [(V8HI "V8HF") (V4SI "V4SF") (V8HF "V8HI")
+ (V4SF "V4SI")])
+(define_mode_attr MVE_LANES [(V16QI "16") (V8HI "8") (V4SI "4")])
+
+(define_mode_attr MVE_constraint [ (V16QI "Ra") (V8HI "Rc") (V4SI "Re")])
+(define_mode_attr MVE_constraint1 [ (V8HI "Ra") (V4SI "Rc")])
+(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf")
+ (V8HF "Rd") (V4SF "Rf")])
+(define_mode_attr MVE_constraint3 [ (V8HI "Rb") (V4SI "Rd")])
+
+(define_mode_attr MVE_pred [ (V16QI "mve_imm_7") (V8HI "mve_imm_15")
+ (V4SI "mve_imm_31")])
+(define_mode_attr MVE_pred1 [ (V8HI "mve_imm_7") (V4SI "mve_imm_15")])
+(define_mode_attr MVE_pred2 [(V16QI "mve_imm_8") (V8HI "mve_imm_16")
+ (V4SI "mve_imm_32")
+ (V8HF "mve_imm_16") (V4SF "mve_imm_32")])
+(define_mode_attr MVE_pred3 [ (V8HI "mve_imm_8") (V4SI "mve_imm_16")])
+
+(define_mode_attr MVE_B_ELEM [ (V16QI "V16QI") (V8HI "V8QI") (V4SI "V4QI")])
+(define_mode_attr MVE_H_ELEM [ (V8HI "V8HI") (V4SI "V4HI")])
+
+(define_mode_attr V_sz_elem1 [(V16QI "b") (V8HI "h") (V4SI "w") (V8HF "h")
+ (V4SF "w")])
+(define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32")
+ (V8HF "u16") (V4SF "32")])
+(define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w")
+ (V8HF "=w") (V4SF "=&w")])
+
;;----------------------------------------------------------------------------
;; Code attributes
;;----------------------------------------------------------------------------
@@ -1181,6 +1212,188 @@
(define_int_attr mmla_sfx [(UNSPEC_MATMUL_S "s8") (UNSPEC_MATMUL_U "u8")
 (UNSPEC_MATMUL_US "s8")])

+;;MVE int attribute.
+(define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") + (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u") + (VCVTAQ_U "u") (VCVTAQ_S "s") (VREV64Q_S "s") + (VREV64Q_U "u") (VMVNQ_S "s") (VMVNQ_U "u") + (VDUPQ_N_U "u") (VDUPQ_N_S"s") (VADDVQ_S "s") + (VADDVQ_U "u") (VADDVQ_S "s") (VADDVQ_U "u") + (VMOVLTQ_U "u") (VMOVLTQ_S "s") (VMOVLBQ_S "s") + (VMOVLBQ_U "u") (VCVTQ_FROM_F_S "s") (VCVTQ_FROM_F_U "u") + (VCVTPQ_S "s") (VCVTPQ_U "u") (VCVTNQ_S "s") + (VCVTNQ_U "u") (VCVTMQ_S "s") (VCVTMQ_U "u") + (VCLZQ_U "u") (VCLZQ_S "s") (VREV32Q_U "u") + (VREV32Q_S "s") (VADDLVQ_U "u") (VADDLVQ_S "s") + (VCVTQ_N_TO_F_S "s") (VCVTQ_N_TO_F_U "u") + (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") + (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") + (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") + (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s") + (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") + (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") + (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") + (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") + (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") + (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") + (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") + (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") + (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") + (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") + (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") + (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") + (VMAXVQ_S "s") (VMAXVQ_U "u") (VMINQ_S "s") (VMINQ_U "u") + (VMINVQ_S "s") (VMINVQ_U "u") (VMLADAVQ_S "s") + (VMLADAVQ_U "u") (VMULHQ_S "s") (VMULHQ_U "u") + (VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u") (VQADDQ_S "s") + (VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u") (VQADDQ_U "u") + (VMULQ_N_S "s") (VMULQ_N_U "u") (VMULQ_S "s") + (VMULQ_U "u") (VORNQ_S "s") (VORNQ_U "u") (VORRQ_S "s") + (VORRQ_U "u") (VQADDQ_N_S "s") (VQADDQ_N_U "u") + (VQRSHLQ_N_S "s") (VQRSHLQ_N_U "u") (VQRSHLQ_S "s") + (VQRSHLQ_U "u") (VQSHLQ_N_S "s") (VQSHLQ_N_U "u") + (VQSHLQ_R_S "s") (VQSHLQ_R_U "u") (VQSHLQ_S "s") + (VQSHLQ_U "u") (VQSUBQ_N_S "s") (VQSUBQ_N_U "u") + (VQSUBQ_S "s") (VQSUBQ_U "u") (VRHADDQ_S "s") + (VRHADDQ_U "u") (VRMULHQ_S "s") (VRMULHQ_U "u") + (VRSHLQ_N_S "s") (VRSHLQ_N_U "u") (VRSHLQ_S "s") + (VRSHLQ_U "u") (VRSHRQ_N_S "s") (VRSHRQ_N_U "u") + (VSHLQ_N_S "s") (VSHLQ_N_U "u") (VSHLQ_R_S "s") + (VSHLQ_R_U "u") (VSUBQ_N_S "s") (VSUBQ_N_U "u") + (VSUBQ_S "s") (VSUBQ_U "u") (VADDVAQ_S "s") + (VADDVAQ_U "u") (VADDLVAQ_S "s") (VADDLVAQ_U "u") + (VBICQ_N_S "s") (VBICQ_N_U "u") (VMLALDAVQ_U "u") + (VMLALDAVQ_S "s") (VMLALDAVXQ_U "u") (VMLALDAVXQ_S "s") + (VMOVNBQ_U "u") (VMOVNBQ_S "s") (VMOVNTQ_U "u") + (VMOVNTQ_S "s") (VORRQ_N_S "s") (VORRQ_N_U "u") + (VQMOVNBQ_U "u") (VQMOVNBQ_S "s") (VQMOVNTQ_S "s") + (VQMOVNTQ_U "u") (VSHLLBQ_N_U "u") (VSHLLBQ_N_S "s") + (VSHLLTQ_N_U "u") (VSHLLTQ_N_S "s") (VRMLALDAVHQ_U "u") + (VRMLALDAVHQ_S "s") (VBICQ_M_N_S "s") (VBICQ_M_N_U "u") + (VCVTAQ_M_S "s") (VCVTAQ_M_U "u") (VCVTQ_M_TO_F_S "s") + (VCVTQ_M_TO_F_U "u") (VQRSHRNBQ_N_S "s") + (VQRSHRNBQ_N_U "u") (VABAVQ_S "s") (VABAVQ_U "u") + (VRMLALDAVHAQ_U "u") (VRMLALDAVHAQ_S "s") (VSHLCQ_S "s") + (VSHLCQ_U "u") (VADDVAQ_P_S "s") (VADDVAQ_P_U "u") + (VCLZQ_M_S "s") (VCLZQ_M_U "u") (VCMPEQQ_M_N_S "s") + (VCMPEQQ_M_N_U "u") (VCMPEQQ_M_S "s") (VCMPEQQ_M_U "u") + (VCMPNEQ_M_N_S "s") (VCMPNEQ_M_N_U "u") (VCMPNEQ_M_S "s") + (VCMPNEQ_M_U "u") (VDUPQ_M_N_S "s") (VDUPQ_M_N_U "u") + (VMAXVQ_P_S "s") (VMAXVQ_P_U "u") (VMINVQ_P_S "s") + (VMINVQ_P_U "u") (VMLADAVAQ_S "s") (VMLADAVAQ_U 
"u") + (VMLADAVQ_P_S "s") (VMLADAVQ_P_U "u") (VMLAQ_N_S "s") + (VMLAQ_N_U "u") (VMLASQ_N_S "s") (VMLASQ_N_U "u") + (VMVNQ_M_S "s") (VMVNQ_M_U "u") (VPSELQ_S "s") + (VPSELQ_U "u") (VQDMLAHQ_N_S "s") + (VQDMLASHQ_N_S "s") + (VQRDMLAHQ_N_S "s") + (VQRDMLASHQ_N_S "s") + (VQRSHLQ_M_N_S "s") (VQRSHLQ_M_N_U "u") + (VQSHLQ_M_R_S "s") (VQSHLQ_M_R_U "u") (VSRIQ_N_S "s") + (VREV64Q_M_S "s") (VREV64Q_M_U "u") (VSRIQ_N_U "u") + (VRSHLQ_M_N_S "s") (VRSHLQ_M_N_U "u") (VSHLQ_M_R_S "s") + (VSHLQ_M_R_U "u") (VSLIQ_N_S "s") (VSLIQ_N_U "u") + (VMLALDAVQ_P_S "s") (VQMOVNBQ_M_S "s") (VMOVLTQ_M_S "s") + (VMOVNBQ_M_S "s") (VRSHRNTQ_N_S "s") (VORRQ_M_N_S "s") + (VREV32Q_M_S "s") (VQRSHRNTQ_N_S "s") (VMOVNTQ_M_S "s") + (VMOVLBQ_M_S "s") (VMLALDAVAQ_S "s") (VQSHRNBQ_N_S "s") + (VSHRNBQ_N_S "s") (VRSHRNBQ_N_S "s") (VMLALDAVXQ_P_S "s") + (VQMOVNTQ_M_S "s") (VMVNQ_M_N_S "s") (VQSHRNTQ_N_S "s") + (VMLALDAVAXQ_S "s") (VSHRNTQ_N_S "s") (VMLALDAVQ_P_U "u") + (VQMOVNBQ_M_U "u") (VMOVLTQ_M_U "u") (VMOVNBQ_M_U "u") + (VRSHRNTQ_N_U "u") (VORRQ_M_N_U "u") (VREV32Q_M_U "u") + (VREV16Q_M_S "s") (VREV16Q_M_U "u") + (VQRSHRNTQ_N_U "u") (VMOVNTQ_M_U "u") (VMOVLBQ_M_U "u") + (VMLALDAVAQ_U "u") (VQSHRNBQ_N_U "u") (VSHRNBQ_N_U "u") + (VRSHRNBQ_N_U "u") (VMLALDAVXQ_P_U "u") + (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u") (VMLALDAVAXQ_U "u") + (VQMOVNTQ_M_U "u") (VSHRNTQ_N_U "u") (VCVTMQ_M_S "s") + (VCVTMQ_M_U "u") (VCVTNQ_M_S "s") (VCVTNQ_M_U "u") + (VCVTPQ_M_S "s") (VCVTPQ_M_U "u") (VADDLVAQ_P_S "s") + (VCVTQ_M_N_FROM_F_U "u") (VCVTQ_M_FROM_F_S "s") + (VCVTQ_M_FROM_F_U "u") (VRMLALDAVHQ_P_U "u") + (VRMLALDAVHQ_P_S "s") (VADDLVAQ_P_U "u") + (VCVTQ_M_N_FROM_F_S "s") (VABAVQ_P_U "u") + (VABAVQ_P_S "s") (VSHLQ_M_S "s") (VSHLQ_M_U "u") + (VSRIQ_M_N_S "s") (VSRIQ_M_N_U "u") (VSUBQ_M_S "s") + (VSUBQ_M_U "u") (VCVTQ_M_N_TO_F_S "s") + (VCVTQ_M_N_TO_F_U "u") (VADDQ_M_N_U "u") + (VSHLQ_M_N_S "s") (VMAXQ_M_U "u") (VHSUBQ_M_N_U "u") + (VMULQ_M_N_S "s") (VQSHLQ_M_U "u") (VRHADDQ_M_S "s") + (VEORQ_M_U "u") (VSHRQ_M_N_U "u") (VCADDQ_ROT90_M_U "u") + (VMLADAVAQ_P_U "u") (VEORQ_M_S "s") (VBRSRQ_M_N_S "s") + (VMULQ_M_U "u") (VQRDMLAHQ_M_N_S "s") (VHSUBQ_M_N_S "s") + (VQRSHLQ_M_S "s") (VMULQ_M_N_U "u") + (VMULQ_M_S "s") (VQSHLQ_M_N_U "u") (VSLIQ_M_N_U "u") + (VMLADAVAQ_P_S "s") (VQRSHLQ_M_U "u") + (VMULLBQ_INT_M_U "u") (VSHLQ_M_N_U "u") (VQSUBQ_M_U "u") + (VQDMLASHQ_M_N_S "s") + (VQRDMLASHQ_M_N_U "u") (VRSHRQ_M_N_S "s") + (VORNQ_M_S "s") (VCADDQ_ROT270_M_S "s") (VRHADDQ_M_U "u") + (VRSHRQ_M_N_U "u") (VMLASQ_M_N_U "u") (VHSUBQ_M_U "u") + (VQSUBQ_M_N_S "s") (VMULLTQ_INT_M_S "s") + (VORRQ_M_S "s") (VQDMLAHQ_M_N_U "u") (VRSHLQ_M_S "s") + (VHADDQ_M_U "u") (VHADDQ_M_N_S "s") (VMULLTQ_INT_M_U "u") + (VORRQ_M_U "u") (VHADDQ_M_S "s") (VHADDQ_M_N_U "u") + (VQDMLAHQ_M_N_S "s") (VMAXQ_M_S "s") (VORNQ_M_U "u") + (VCADDQ_ROT270_M_U "u") (VQADDQ_M_U "u") + (VQRDMLASHQ_M_N_S "s") (VBICQ_M_U "u") (VMINQ_M_U "u") + (VSUBQ_M_N_S "s") (VMULLBQ_INT_M_S "s") (VQSUBQ_M_S "s") + (VCADDQ_ROT90_M_S "s") (VRMULHQ_M_S "s") (VANDQ_M_U "u") + (VMULHQ_M_S "s") (VADDQ_M_S "s") (VQRDMLAHQ_M_N_U "u") + (VMLASQ_M_N_S "s") (VHSUBQ_M_S "s") (VRMULHQ_M_U "u") + (VQADDQ_M_N_S "s") (VSHRQ_M_N_S "s") (VANDQ_M_S "s") + (VABDQ_M_U "u") (VQSHLQ_M_S "s") (VABDQ_M_S "s") + (VSUBQ_M_N_U "u") (VMLAQ_M_N_S "s") (VBRSRQ_M_N_U "u") + (VADDQ_M_U "u") (VRSHLQ_M_U "u") (VSLIQ_M_N_S "s") + (VQADDQ_M_N_U "u") (VADDQ_M_N_S "s") (VQSUBQ_M_N_U "u") + (VMLAQ_M_N_U "u") (VMINQ_M_S "s") (VMULHQ_M_U "u") + (VQADDQ_M_S "s") (VBICQ_M_S "s") (VQSHLQ_M_N_S "s") + (VQSHRNTQ_M_N_S "s") (VQSHRNTQ_M_N_U "u") + (VSHRNTQ_M_N_U "u") 
(VSHRNTQ_M_N_S "s") + (VSHRNBQ_M_N_S "s") (VSHRNBQ_M_N_U "u") + (VSHLLTQ_M_N_S "s") (VSHLLTQ_M_N_U "u") + (VSHLLBQ_M_N_S "s") (VSHLLBQ_M_N_U "u") + (VRSHRNTQ_M_N_S "s") (VRSHRNTQ_M_N_U "u") + (VRSHRNBQ_M_N_U "u") (VRSHRNBQ_M_N_S "s") + (VQSHRNTQ_M_N_U "u") (VQSHRNTQ_M_N_S "s") + (VQSHRNBQ_M_N_S "s") (VQSHRNBQ_M_N_U "u") + (VQRSHRNTQ_M_N_S "s") (VQRSHRNTQ_M_N_U "u") + (VQRSHRNBQ_M_N_S "s") (VQRSHRNBQ_M_N_U "u") + (VMLALDAVAXQ_P_S "s") + (VMLALDAVAQ_P_S "s") (VMLALDAVAQ_P_U "u") + (VSTRWQSB_S "s") (VSTRWQSB_U "u") (VSTRBQSO_S "s") + (VSTRBQSO_U "u") (VSTRBQ_S "s") (VSTRBQ_U "u") + (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRBQ_S "s") + (VLDRBQ_U "u") (VLDRWQGB_S "s") (VLDRWQGB_U "u") + (VLD1Q_S "s") (VLD1Q_U "u") (VLDRHQGO_S "s") + (VLDRHQGO_U "u") (VLDRHQGSO_S "s") (VLDRHQGSO_U "u") + (VLDRHQ_S "s") (VLDRHQ_U "u") (VLDRWQ_S "s") + (VLDRWQ_U "u") (VLDRDQGB_S "s") (VLDRDQGB_U "u") + (VLDRDQGO_S "s") (VLDRDQGO_U "u") (VLDRDQGSO_S "s") + (VLDRDQGSO_U "u") (VLDRWQGO_S "s") (VLDRWQGO_U "u") + (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") (VST1Q_S "s") + (VST1Q_U "u") (VSTRHQSO_S "s") (VSTRHQSO_U "u") + (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRHQ_S "s") + (VSTRHQ_U "u") (VSTRWQ_S "s") (VSTRWQ_U "u") + (VSTRDQSB_S "s") (VSTRDQSB_U "u") (VSTRDQSO_S "s") + (VSTRDQSO_U "u") (VSTRDQSSO_S "s") (VSTRDQSSO_U "u") + (VSTRWQSO_U "u") (VSTRWQSO_S "s") (VSTRWQSSO_U "u") + (VSTRWQSSO_S "s") (VSTRWQSBWB_S "s") (VSTRWQSBWB_U "u") + (VLDRWQGBWB_S "s") (VLDRWQGBWB_U "u") (VLDRDQGBWB_S "s") + (VLDRDQGBWB_U "u") (VSTRDQSBWB_S "s") (VADCQ_M_S "s") + (VSTRDQSBWB_U "u") (VSBCQ_U "u") (VSBCQ_M_U "u") + (VSBCQ_S "s") (VSBCQ_M_S "s") (VSBCIQ_U "u") + (VSBCIQ_M_U "u") (VSBCIQ_S "s") (VSBCIQ_M_S "s") + (VADCQ_U "u") (VADCQ_M_U "u") (VADCQ_S "s") + (VADCIQ_U "u") (VADCIQ_M_U "u") (VADCIQ_S "s") + (VADCIQ_M_S "s") (SQRSHRL_64 "64") (SQRSHRL_48 "48") + (UQRSHLL_64 "64") (UQRSHLL_48 "48") (VSHLCQ_M_S "s") + (VSHLCQ_M_U "u")]) + +(define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") + (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") + (VCTP32Q_M "32") (VCTP64Q_M "64")]) ;; Both kinds of return insn. (define_code_iterator RETURNS [return simple_return]) @@ -1256,3 +1469,249 @@ ;; An iterator for CDE MVE accumulator/non-accumulator versions. (define_int_attr a [(UNSPEC_VCDE "") (UNSPEC_VCDEA "a")]) + +;; MVE int iterator. 
+(define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U]) +(define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S]) +(define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U]) +(define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S VCVTQ_FROM_F_U]) +(define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S]) +(define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S]) +(define_int_iterator VMVNQ [VMVNQ_U VMVNQ_S]) +(define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S]) +(define_int_iterator VCLZQ [VCLZQ_U VCLZQ_S]) +(define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S]) +(define_int_iterator VREV32Q [VREV32Q_U VREV32Q_S]) +(define_int_iterator VMOVLBQ [VMOVLBQ_S VMOVLBQ_U]) +(define_int_iterator VMOVLTQ [VMOVLTQ_U VMOVLTQ_S]) +(define_int_iterator VCVTPQ [VCVTPQ_S VCVTPQ_U]) +(define_int_iterator VCVTNQ [VCVTNQ_S VCVTNQ_U]) +(define_int_iterator VCVTMQ [VCVTMQ_S VCVTMQ_U]) +(define_int_iterator VADDLVQ [VADDLVQ_U VADDLVQ_S]) +(define_int_iterator VCTPQ [VCTP8Q VCTP16Q VCTP32Q VCTP64Q]) +(define_int_iterator VCTPQ_M [VCTP8Q_M VCTP16Q_M VCTP32Q_M VCTP64Q_M]) +(define_int_iterator VCVTQ_N_TO_F [VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U]) +(define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S]) +(define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) +(define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) +(define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) +(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S]) +(define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) +(define_int_iterator VABDQ [VABDQ_S VABDQ_U]) +(define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) +(define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) +(define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) +(define_int_iterator VANDQ [VANDQ_U VANDQ_S]) +(define_int_iterator VBICQ [VBICQ_S VBICQ_U]) +(define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) +(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) +(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) +(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) +(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) +(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) +(define_int_iterator VEORQ [VEORQ_U VEORQ_S]) +(define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) +(define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) +(define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) +(define_int_iterator VHSUBQ_N [VHSUBQ_N_U VHSUBQ_N_S]) +(define_int_iterator VMAXQ [VMAXQ_U VMAXQ_S]) +(define_int_iterator VMAXVQ [VMAXVQ_U VMAXVQ_S]) +(define_int_iterator VMINQ [VMINQ_S VMINQ_U]) +(define_int_iterator VMINVQ [VMINVQ_U VMINVQ_S]) +(define_int_iterator VMLADAVQ [VMLADAVQ_U VMLADAVQ_S]) +(define_int_iterator VMULHQ [VMULHQ_S VMULHQ_U]) +(define_int_iterator VMULLBQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S]) +(define_int_iterator VMULLTQ_INT [VMULLTQ_INT_U VMULLTQ_INT_S]) +(define_int_iterator VMULQ [VMULQ_U VMULQ_S]) +(define_int_iterator VMULQ_N [VMULQ_N_U VMULQ_N_S]) +(define_int_iterator VORNQ [VORNQ_U VORNQ_S]) +(define_int_iterator VORRQ [VORRQ_S VORRQ_U]) +(define_int_iterator VQADDQ [VQADDQ_U VQADDQ_S]) +(define_int_iterator VQADDQ_N [VQADDQ_N_S VQADDQ_N_U]) +(define_int_iterator VQRSHLQ [VQRSHLQ_S VQRSHLQ_U]) +(define_int_iterator VQRSHLQ_N [VQRSHLQ_N_S VQRSHLQ_N_U]) +(define_int_iterator VQSHLQ [VQSHLQ_S VQSHLQ_U]) +(define_int_iterator VQSHLQ_N [VQSHLQ_N_S VQSHLQ_N_U]) +(define_int_iterator VQSHLQ_R [VQSHLQ_R_U VQSHLQ_R_S]) +(define_int_iterator VQSUBQ [VQSUBQ_U VQSUBQ_S]) +(define_int_iterator VQSUBQ_N [VQSUBQ_N_S VQSUBQ_N_U]) +(define_int_iterator VRHADDQ [VRHADDQ_S VRHADDQ_U]) +(define_int_iterator 
VRMULHQ [VRMULHQ_S VRMULHQ_U]) +(define_int_iterator VRSHLQ [VRSHLQ_S VRSHLQ_U]) +(define_int_iterator VRSHLQ_N [VRSHLQ_N_U VRSHLQ_N_S]) +(define_int_iterator VRSHRQ_N [VRSHRQ_N_S VRSHRQ_N_U]) +(define_int_iterator VSHLQ_N [VSHLQ_N_U VSHLQ_N_S]) +(define_int_iterator VSHLQ_R [VSHLQ_R_S VSHLQ_R_U]) +(define_int_iterator VSUBQ [VSUBQ_S VSUBQ_U]) +(define_int_iterator VSUBQ_N [VSUBQ_N_S VSUBQ_N_U]) +(define_int_iterator VADDLVAQ [VADDLVAQ_S VADDLVAQ_U]) +(define_int_iterator VBICQ_N [VBICQ_N_S VBICQ_N_U]) +(define_int_iterator VMLALDAVQ [VMLALDAVQ_U VMLALDAVQ_S]) +(define_int_iterator VMLALDAVXQ [VMLALDAVXQ_U VMLALDAVXQ_S]) +(define_int_iterator VMOVNBQ [VMOVNBQ_U VMOVNBQ_S]) +(define_int_iterator VMOVNTQ [VMOVNTQ_S VMOVNTQ_U]) +(define_int_iterator VORRQ_N [VORRQ_N_U VORRQ_N_S]) +(define_int_iterator VQMOVNBQ [VQMOVNBQ_U VQMOVNBQ_S]) +(define_int_iterator VQMOVNTQ [VQMOVNTQ_U VQMOVNTQ_S]) +(define_int_iterator VSHLLBQ_N [VSHLLBQ_N_S VSHLLBQ_N_U]) +(define_int_iterator VSHLLTQ_N [VSHLLTQ_N_U VSHLLTQ_N_S]) +(define_int_iterator VRMLALDAVHQ [VRMLALDAVHQ_U VRMLALDAVHQ_S]) +(define_int_iterator VBICQ_M_N [VBICQ_M_N_S VBICQ_M_N_U]) +(define_int_iterator VCVTAQ_M [VCVTAQ_M_S VCVTAQ_M_U]) +(define_int_iterator VCVTQ_M_TO_F [VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U]) +(define_int_iterator VQRSHRNBQ_N [VQRSHRNBQ_N_U VQRSHRNBQ_N_S]) +(define_int_iterator VABAVQ [VABAVQ_S VABAVQ_U]) +(define_int_iterator VSHLCQ [VSHLCQ_S VSHLCQ_U]) +(define_int_iterator VRMLALDAVHAQ [VRMLALDAVHAQ_S VRMLALDAVHAQ_U]) +(define_int_iterator VADDVAQ_P [VADDVAQ_P_S VADDVAQ_P_U]) +(define_int_iterator VCLZQ_M [VCLZQ_M_S VCLZQ_M_U]) +(define_int_iterator VCMPEQQ_M_N [VCMPEQQ_M_N_S VCMPEQQ_M_N_U]) +(define_int_iterator VCMPEQQ_M [VCMPEQQ_M_S VCMPEQQ_M_U]) +(define_int_iterator VCMPNEQ_M_N [VCMPNEQ_M_N_S VCMPNEQ_M_N_U]) +(define_int_iterator VCMPNEQ_M [VCMPNEQ_M_S VCMPNEQ_M_U]) +(define_int_iterator VDUPQ_M_N [VDUPQ_M_N_S VDUPQ_M_N_U]) +(define_int_iterator VMAXVQ_P [VMAXVQ_P_S VMAXVQ_P_U]) +(define_int_iterator VMINVQ_P [VMINVQ_P_S VMINVQ_P_U]) +(define_int_iterator VMLADAVAQ [VMLADAVAQ_S VMLADAVAQ_U]) +(define_int_iterator VMLADAVQ_P [VMLADAVQ_P_S VMLADAVQ_P_U]) +(define_int_iterator VMLAQ_N [VMLAQ_N_S VMLAQ_N_U]) +(define_int_iterator VMLASQ_N [VMLASQ_N_S VMLASQ_N_U]) +(define_int_iterator VMVNQ_M [VMVNQ_M_S VMVNQ_M_U]) +(define_int_iterator VPSELQ [VPSELQ_S VPSELQ_U]) +(define_int_iterator VQDMLAHQ_N [VQDMLAHQ_N_S]) +(define_int_iterator VQDMLASHQ_N [VQDMLASHQ_N_S]) +(define_int_iterator VQRDMLAHQ_N [VQRDMLAHQ_N_S]) +(define_int_iterator VQRDMLASHQ_N [VQRDMLASHQ_N_S]) +(define_int_iterator VQRSHLQ_M_N [VQRSHLQ_M_N_S VQRSHLQ_M_N_U]) +(define_int_iterator VQSHLQ_M_R [VQSHLQ_M_R_S VQSHLQ_M_R_U]) +(define_int_iterator VREV64Q_M [VREV64Q_M_S VREV64Q_M_U]) +(define_int_iterator VRSHLQ_M_N [VRSHLQ_M_N_S VRSHLQ_M_N_U]) +(define_int_iterator VSHLQ_M_R [VSHLQ_M_R_S VSHLQ_M_R_U]) +(define_int_iterator VSLIQ_N [VSLIQ_N_S VSLIQ_N_U]) +(define_int_iterator VSRIQ_N [VSRIQ_N_S VSRIQ_N_U]) +(define_int_iterator VMLALDAVQ_P [VMLALDAVQ_P_U VMLALDAVQ_P_S]) +(define_int_iterator VQMOVNBQ_M [VQMOVNBQ_M_S VQMOVNBQ_M_U]) +(define_int_iterator VMOVLTQ_M [VMOVLTQ_M_U VMOVLTQ_M_S]) +(define_int_iterator VMOVNBQ_M [VMOVNBQ_M_U VMOVNBQ_M_S]) +(define_int_iterator VRSHRNTQ_N [VRSHRNTQ_N_U VRSHRNTQ_N_S]) +(define_int_iterator VORRQ_M_N [VORRQ_M_N_S VORRQ_M_N_U]) +(define_int_iterator VREV32Q_M [VREV32Q_M_S VREV32Q_M_U]) +(define_int_iterator VREV16Q_M [VREV16Q_M_S VREV16Q_M_U]) +(define_int_iterator VQRSHRNTQ_N [VQRSHRNTQ_N_U VQRSHRNTQ_N_S]) +(define_int_iterator 
VMOVNTQ_M [VMOVNTQ_M_U VMOVNTQ_M_S]) +(define_int_iterator VMOVLBQ_M [VMOVLBQ_M_U VMOVLBQ_M_S]) +(define_int_iterator VMLALDAVAQ [VMLALDAVAQ_S VMLALDAVAQ_U]) +(define_int_iterator VQSHRNBQ_N [VQSHRNBQ_N_U VQSHRNBQ_N_S]) +(define_int_iterator VSHRNBQ_N [VSHRNBQ_N_U VSHRNBQ_N_S]) +(define_int_iterator VRSHRNBQ_N [VRSHRNBQ_N_S VRSHRNBQ_N_U]) +(define_int_iterator VMLALDAVXQ_P [VMLALDAVXQ_P_U VMLALDAVXQ_P_S]) +(define_int_iterator VQMOVNTQ_M [VQMOVNTQ_M_U VQMOVNTQ_M_S]) +(define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S]) +(define_int_iterator VQSHRNTQ_N [VQSHRNTQ_N_U VQSHRNTQ_N_S]) +(define_int_iterator VMLALDAVAXQ [VMLALDAVAXQ_S VMLALDAVAXQ_U]) +(define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U]) +(define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U]) +(define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U]) +(define_int_iterator VCVTPQ_M [VCVTPQ_M_S VCVTPQ_M_U]) +(define_int_iterator VCVTQ_M_N_FROM_F [VCVTQ_M_N_FROM_F_S VCVTQ_M_N_FROM_F_U]) +(define_int_iterator VCVTQ_M_FROM_F [VCVTQ_M_FROM_F_U VCVTQ_M_FROM_F_S]) +(define_int_iterator VRMLALDAVHQ_P [VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U]) +(define_int_iterator VADDLVAQ_P [VADDLVAQ_P_U VADDLVAQ_P_S]) +(define_int_iterator VABAVQ_P [VABAVQ_P_S VABAVQ_P_U]) +(define_int_iterator VSHLQ_M [VSHLQ_M_S VSHLQ_M_U]) +(define_int_iterator VSRIQ_M_N [VSRIQ_M_N_S VSRIQ_M_N_U]) +(define_int_iterator VSUBQ_M [VSUBQ_M_U VSUBQ_M_S]) +(define_int_iterator VCVTQ_M_N_TO_F [VCVTQ_M_N_TO_F_U VCVTQ_M_N_TO_F_S]) +(define_int_iterator VHSUBQ_M [VHSUBQ_M_S VHSUBQ_M_U]) +(define_int_iterator VSLIQ_M_N [VSLIQ_M_N_U VSLIQ_M_N_S]) +(define_int_iterator VRSHLQ_M [VRSHLQ_M_S VRSHLQ_M_U]) +(define_int_iterator VMINQ_M [VMINQ_M_S VMINQ_M_U]) +(define_int_iterator VMULLBQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S]) +(define_int_iterator VMULHQ_M [VMULHQ_M_S VMULHQ_M_U]) +(define_int_iterator VMULQ_M [VMULQ_M_S VMULQ_M_U]) +(define_int_iterator VHSUBQ_M_N [VHSUBQ_M_N_S VHSUBQ_M_N_U]) +(define_int_iterator VHADDQ_M_N [VHADDQ_M_N_S VHADDQ_M_N_U]) +(define_int_iterator VORRQ_M [VORRQ_M_S VORRQ_M_U]) +(define_int_iterator VRMULHQ_M [VRMULHQ_M_U VRMULHQ_M_S]) +(define_int_iterator VQADDQ_M [VQADDQ_M_U VQADDQ_M_S]) +(define_int_iterator VRSHRQ_M_N [VRSHRQ_M_N_S VRSHRQ_M_N_U]) +(define_int_iterator VQSUBQ_M_N [VQSUBQ_M_N_U VQSUBQ_M_N_S]) +(define_int_iterator VADDQ_M [VADDQ_M_U VADDQ_M_S]) +(define_int_iterator VORNQ_M [VORNQ_M_U VORNQ_M_S]) +(define_int_iterator VRHADDQ_M [VRHADDQ_M_U VRHADDQ_M_S]) +(define_int_iterator VQSHLQ_M [VQSHLQ_M_U VQSHLQ_M_S]) +(define_int_iterator VANDQ_M [VANDQ_M_U VANDQ_M_S]) +(define_int_iterator VBICQ_M [VBICQ_M_U VBICQ_M_S]) +(define_int_iterator VSHLQ_M_N [VSHLQ_M_N_S VSHLQ_M_N_U]) +(define_int_iterator VCADDQ_ROT270_M [VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S]) +(define_int_iterator VQRSHLQ_M [VQRSHLQ_M_U VQRSHLQ_M_S]) +(define_int_iterator VQADDQ_M_N [VQADDQ_M_N_U VQADDQ_M_N_S]) +(define_int_iterator VADDQ_M_N [VADDQ_M_N_S VADDQ_M_N_U]) +(define_int_iterator VMAXQ_M [VMAXQ_M_S VMAXQ_M_U]) +(define_int_iterator VQSUBQ_M [VQSUBQ_M_U VQSUBQ_M_S]) +(define_int_iterator VMLASQ_M_N [VMLASQ_M_N_U VMLASQ_M_N_S]) +(define_int_iterator VMLADAVAQ_P [VMLADAVAQ_P_U VMLADAVAQ_P_S]) +(define_int_iterator VBRSRQ_M_N [VBRSRQ_M_N_U VBRSRQ_M_N_S]) +(define_int_iterator VMULQ_M_N [VMULQ_M_N_U VMULQ_M_N_S]) +(define_int_iterator VCADDQ_ROT90_M [VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S]) +(define_int_iterator VMULLTQ_INT_M [VMULLTQ_INT_M_S VMULLTQ_INT_M_U]) +(define_int_iterator VEORQ_M [VEORQ_M_S VEORQ_M_U]) +(define_int_iterator VSHRQ_M_N [VSHRQ_M_N_S VSHRQ_M_N_U]) 
+(define_int_iterator VSUBQ_M_N [VSUBQ_M_N_S VSUBQ_M_N_U]) +(define_int_iterator VHADDQ_M [VHADDQ_M_S VHADDQ_M_U]) +(define_int_iterator VABDQ_M [VABDQ_M_S VABDQ_M_U]) +(define_int_iterator VMLAQ_M_N [VMLAQ_M_N_S VMLAQ_M_N_U]) +(define_int_iterator VQSHLQ_M_N [VQSHLQ_M_N_S VQSHLQ_M_N_U]) +(define_int_iterator VMLALDAVAQ_P [VMLALDAVAQ_P_U VMLALDAVAQ_P_S]) +(define_int_iterator VMLALDAVAXQ_P [VMLALDAVAXQ_P_S]) +(define_int_iterator VQRSHRNBQ_M_N [VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S]) +(define_int_iterator VQRSHRNTQ_M_N [VQRSHRNTQ_M_N_S VQRSHRNTQ_M_N_U]) +(define_int_iterator VQSHRNBQ_M_N [VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S]) +(define_int_iterator VQSHRNTQ_M_N [VQSHRNTQ_M_N_S VQSHRNTQ_M_N_U]) +(define_int_iterator VRSHRNBQ_M_N [VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S]) +(define_int_iterator VRSHRNTQ_M_N [VRSHRNTQ_M_N_U VRSHRNTQ_M_N_S]) +(define_int_iterator VSHLLBQ_M_N [VSHLLBQ_M_N_U VSHLLBQ_M_N_S]) +(define_int_iterator VSHLLTQ_M_N [VSHLLTQ_M_N_U VSHLLTQ_M_N_S]) +(define_int_iterator VSHRNBQ_M_N [VSHRNBQ_M_N_S VSHRNBQ_M_N_U]) +(define_int_iterator VSHRNTQ_M_N [VSHRNTQ_M_N_S VSHRNTQ_M_N_U]) +(define_int_iterator VSTRWSBQ [VSTRWQSB_S VSTRWQSB_U]) +(define_int_iterator VSTRBSOQ [VSTRBQSO_S VSTRBQSO_U]) +(define_int_iterator VSTRBQ [VSTRBQ_S VSTRBQ_U]) +(define_int_iterator VLDRBGOQ [VLDRBQGO_S VLDRBQGO_U]) +(define_int_iterator VLDRBQ [VLDRBQ_S VLDRBQ_U]) +(define_int_iterator VLDRWGBQ [VLDRWQGB_S VLDRWQGB_U]) +(define_int_iterator VLD1Q [VLD1Q_S VLD1Q_U]) +(define_int_iterator VLDRHGOQ [VLDRHQGO_S VLDRHQGO_U]) +(define_int_iterator VLDRHGSOQ [VLDRHQGSO_S VLDRHQGSO_U]) +(define_int_iterator VLDRHQ [VLDRHQ_S VLDRHQ_U]) +(define_int_iterator VLDRWQ [VLDRWQ_S VLDRWQ_U]) +(define_int_iterator VLDRDGBQ [VLDRDQGB_S VLDRDQGB_U]) +(define_int_iterator VLDRDGOQ [VLDRDQGO_S VLDRDQGO_U]) +(define_int_iterator VLDRDGSOQ [VLDRDQGSO_S VLDRDQGSO_U]) +(define_int_iterator VLDRWGOQ [VLDRWQGO_S VLDRWQGO_U]) +(define_int_iterator VLDRWGSOQ [VLDRWQGSO_S VLDRWQGSO_U]) +(define_int_iterator VST1Q [VST1Q_S VST1Q_U]) +(define_int_iterator VSTRHSOQ [VSTRHQSO_S VSTRHQSO_U]) +(define_int_iterator VSTRHSSOQ [VSTRHQSSO_S VSTRHQSSO_U]) +(define_int_iterator VSTRHQ [VSTRHQ_S VSTRHQ_U]) +(define_int_iterator VSTRWQ [VSTRWQ_S VSTRWQ_U]) +(define_int_iterator VSTRDSBQ [VSTRDQSB_S VSTRDQSB_U]) +(define_int_iterator VSTRDSOQ [VSTRDQSO_S VSTRDQSO_U]) +(define_int_iterator VSTRDSSOQ [VSTRDQSSO_S VSTRDQSSO_U]) +(define_int_iterator VSTRWSOQ [VSTRWQSO_S VSTRWQSO_U]) +(define_int_iterator VSTRWSSOQ [VSTRWQSSO_S VSTRWQSSO_U]) +(define_int_iterator VSTRWSBWBQ [VSTRWQSBWB_S VSTRWQSBWB_U]) +(define_int_iterator VLDRWGBWBQ [VLDRWQGBWB_S VLDRWQGBWB_U]) +(define_int_iterator VSTRDSBWBQ [VSTRDQSBWB_S VSTRDQSBWB_U]) +(define_int_iterator VLDRDGBWBQ [VLDRDQGBWB_S VLDRDQGBWB_U]) +(define_int_iterator VADCIQ [VADCIQ_U VADCIQ_S]) +(define_int_iterator VADCIQ_M [VADCIQ_M_U VADCIQ_M_S]) +(define_int_iterator VSBCQ [VSBCQ_U VSBCQ_S]) +(define_int_iterator VSBCQ_M [VSBCQ_M_U VSBCQ_M_S]) +(define_int_iterator VSBCIQ [VSBCIQ_U VSBCIQ_S]) +(define_int_iterator VSBCIQ_M [VSBCIQ_M_U VSBCIQ_M_S]) +(define_int_iterator VADCQ [VADCQ_U VADCQ_S]) +(define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) +(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) +(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) +(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 465b39a..ecbaaa9 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -17,654 +17,6 @@ ;; along with GCC; see the file COPYING3. 
If not see ;; <http://www.gnu.org/licenses/>. -(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF]) -(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF]) -(define_mode_iterator MVE_0 [V8HF V4SF]) -(define_mode_iterator MVE_1 [V16QI V8HI V4SI V2DI]) -(define_mode_iterator MVE_3 [V16QI V8HI]) -(define_mode_iterator MVE_2 [V16QI V8HI V4SI]) -(define_mode_iterator MVE_5 [V8HI V4SI]) -(define_mode_iterator MVE_6 [V8HI V4SI]) - -(define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F - VRNDAQ_F VREV64Q_F VNEGQ_F VDUPQ_N_F VABSQ_F VREV32Q_F - VCVTTQ_F32_F16 VCVTBQ_F32_F16 VCVTQ_TO_F_S VQNEGQ_S - VCVTQ_TO_F_U VREV16Q_S VREV16Q_U VADDLVQ_S VMVNQ_N_S - VMVNQ_N_U VCVTAQ_S VCVTAQ_U VREV64Q_S VREV64Q_U - VQABSQ_S VNEGQ_S VMVNQ_S VMVNQ_U VDUPQ_N_U VDUPQ_N_S - VCLZQ_U VCLZQ_S VCLSQ_S VADDVQ_S VADDVQ_U VABSQ_S - VREV32Q_U VREV32Q_S VMOVLTQ_U VMOVLTQ_S VMOVLBQ_S - VMOVLBQ_U VCVTQ_FROM_F_S VCVTQ_FROM_F_U VCVTPQ_S - VCVTPQ_U VCVTNQ_S VCVTNQ_U VCVTMQ_S VCVTMQ_U - VADDLVQ_U VCTP8Q VCTP16Q VCTP32Q VCTP64Q VPNOT - VCREATEQ_F VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U VBRSRQ_N_F - VSUBQ_N_F VCREATEQ_U VCREATEQ_S VSHRQ_N_S VSHRQ_N_U - VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U VADDLVQ_P_S - VADDLVQ_P_U VCMPNEQ_U VCMPNEQ_S VSHLQ_S VSHLQ_U VABDQ_S - VADDQ_N_S VADDVAQ_S VADDVQ_P_S VANDQ_S VBICQ_S - VBRSRQ_N_S VCADDQ_ROT270_S VCADDQ_ROT90_S VCMPEQQ_S - VCMPEQQ_N_S VCMPNEQ_N_S VEORQ_S VHADDQ_S VHADDQ_N_S - VHSUBQ_S VHSUBQ_N_S VMAXQ_S VMAXVQ_S VMINQ_S VMINVQ_S - VMLADAVQ_S VMULHQ_S VMULLBQ_INT_S VMULLTQ_INT_S VMULQ_S - VMULQ_N_S VORNQ_S VORRQ_S VQADDQ_S VQADDQ_N_S VQRSHLQ_S - VQRSHLQ_N_S VQSHLQ_S VQSHLQ_N_S VQSHLQ_R_S VQSUBQ_S - VQSUBQ_N_S VRHADDQ_S VRMULHQ_S VRSHLQ_S VRSHLQ_N_S - VRSHRQ_N_S VSHLQ_N_S VSHLQ_R_S VSUBQ_S VSUBQ_N_S - VABDQ_U VADDQ_N_U VADDVAQ_U VADDVQ_P_U VANDQ_U VBICQ_U - VBRSRQ_N_U VCADDQ_ROT270_U VCADDQ_ROT90_U VCMPEQQ_U - VCMPEQQ_N_U VCMPNEQ_N_U VEORQ_U VHADDQ_U VHADDQ_N_U - VHSUBQ_U VHSUBQ_N_U VMAXQ_U VMAXVQ_U VMINQ_U VMINVQ_U - VMLADAVQ_U VMULHQ_U VMULLBQ_INT_U VMULLTQ_INT_U VMULQ_U - VMULQ_N_U VORNQ_U VORRQ_U VQADDQ_U VQADDQ_N_U VQRSHLQ_U - VQRSHLQ_N_U VQSHLQ_U VQSHLQ_N_U VQSHLQ_R_U VQSUBQ_U - VQSUBQ_N_U VRHADDQ_U VRMULHQ_U VRSHLQ_U VRSHLQ_N_U - VRSHRQ_N_U VSHLQ_N_U VSHLQ_R_U VSUBQ_U VSUBQ_N_U - VCMPGEQ_N_S VCMPGEQ_S VCMPGTQ_N_S VCMPGTQ_S VCMPLEQ_N_S - VCMPLEQ_S VCMPLTQ_N_S VCMPLTQ_S VHCADDQ_ROT270_S - VHCADDQ_ROT90_S VMAXAQ_S VMAXAVQ_S VMINAQ_S VMINAVQ_S - VMLADAVXQ_S VMLSDAVQ_S VMLSDAVXQ_S VQDMULHQ_N_S - VQDMULHQ_S VQRDMULHQ_N_S VQRDMULHQ_S VQSHLUQ_N_S - VCMPCSQ_N_U VCMPCSQ_U VCMPHIQ_N_U VCMPHIQ_U VABDQ_M_S - VABDQ_M_U VABDQ_F VADDQ_N_F VANDQ_F VBICQ_F - VCADDQ_ROT270_F VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F - VCMPGEQ_F VCMPGEQ_N_F VCMPGTQ_F VCMPGTQ_N_F VCMPLEQ_F - VCMPLEQ_N_F VCMPLTQ_F VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F - VCMULQ_F VCMULQ_ROT180_F VCMULQ_ROT270_F VCMULQ_ROT90_F - VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F VMAXNMQ_F VMAXNMVQ_F - VMINNMAQ_F VMINNMAVQ_F VMINNMQ_F VMINNMVQ_F VMULQ_F - VMULQ_N_F VORNQ_F VORRQ_F VSUBQ_F VADDLVAQ_U - VADDLVAQ_S VBICQ_N_U VBICQ_N_S VCTP8Q_M VCTP16Q_M - VCTP32Q_M VCTP64Q_M VCVTBQ_F16_F32 VCVTTQ_F16_F32 - VMLALDAVQ_U VMLALDAVXQ_U VMLALDAVXQ_S VMLALDAVQ_S - VMLSLDAVQ_S VMLSLDAVXQ_S VMOVNBQ_U VMOVNBQ_S - VMOVNTQ_U VMOVNTQ_S VORRQ_N_S VORRQ_N_U VQDMULLBQ_N_S - VQDMULLBQ_S VQDMULLTQ_N_S VQDMULLTQ_S VQMOVNBQ_U - VQMOVNBQ_S VQMOVUNBQ_S VQMOVUNTQ_S VRMLALDAVHXQ_S - VRMLSLDAVHQ_S VRMLSLDAVHXQ_S VSHLLBQ_S - VSHLLBQ_U VSHLLTQ_U VSHLLTQ_S VQMOVNTQ_U VQMOVNTQ_S - VSHLLBQ_N_S VSHLLBQ_N_U VSHLLTQ_N_U VSHLLTQ_N_S - VRMLALDAVHQ_U VRMLALDAVHQ_S VMULLTQ_POLY_P - 
VMULLBQ_POLY_P VBICQ_M_N_S VBICQ_M_N_U VCMPEQQ_M_F - VCVTAQ_M_S VCVTAQ_M_U VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U - VQRSHRNBQ_N_U VQRSHRNBQ_N_S VQRSHRUNBQ_N_S - VRMLALDAVHAQ_S VABAVQ_S VABAVQ_U VSHLCQ_S VSHLCQ_U - VRMLALDAVHAQ_U VABSQ_M_S VADDVAQ_P_S VADDVAQ_P_U - VCLSQ_M_S VCLZQ_M_S VCLZQ_M_U VCMPCSQ_M_N_U - VCMPCSQ_M_U VCMPEQQ_M_N_S VCMPEQQ_M_N_U VCMPEQQ_M_S - VCMPEQQ_M_U VCMPGEQ_M_N_S VCMPGEQ_M_S VCMPGTQ_M_N_S - VCMPGTQ_M_S VCMPHIQ_M_N_U VCMPHIQ_M_U VCMPLEQ_M_N_S - VCMPLEQ_M_S VCMPLTQ_M_N_S VCMPLTQ_M_S VCMPNEQ_M_N_S - VCMPNEQ_M_N_U VCMPNEQ_M_S VCMPNEQ_M_U VDUPQ_M_N_S - VDUPQ_M_N_U VDWDUPQ_N_U VDWDUPQ_WB_U VIWDUPQ_N_U - VIWDUPQ_WB_U VMAXAQ_M_S VMAXAVQ_P_S VMAXVQ_P_S - VMAXVQ_P_U VMINAQ_M_S VMINAVQ_P_S VMINVQ_P_S VMINVQ_P_U - VMLADAVAQ_S VMLADAVAQ_U VMLADAVQ_P_S VMLADAVQ_P_U - VMLADAVXQ_P_S VMLAQ_N_S VMLAQ_N_U VMLASQ_N_S VMLASQ_N_U - VMLSDAVQ_P_S VMLSDAVXQ_P_S VMVNQ_M_S VMVNQ_M_U - VNEGQ_M_S VPSELQ_S VPSELQ_U VQABSQ_M_S VQDMLAHQ_N_S - VQDMLAHQ_N_U VQNEGQ_M_S VQRDMLADHQ_S VQRDMLADHXQ_S - VQRDMLAHQ_N_S VQRDMLAHQ_N_U VQRDMLASHQ_N_S - VQRDMLASHQ_N_U VQRDMLSDHQ_S VQRDMLSDHXQ_S VQRSHLQ_M_N_S - VQRSHLQ_M_N_U VQSHLQ_M_R_S VQSHLQ_M_R_U VREV64Q_M_S - VREV64Q_M_U VRSHLQ_M_N_S VRSHLQ_M_N_U VSHLQ_M_R_S - VSHLQ_M_R_U VSLIQ_N_S VSLIQ_N_U VSRIQ_N_S VSRIQ_N_U - VQDMLSDHXQ_S VQDMLSDHQ_S VQDMLADHXQ_S VQDMLADHQ_S - VMLSDAVAXQ_S VMLSDAVAQ_S VMLADAVAXQ_S - VCMPGEQ_M_F VCMPGTQ_M_N_F VMLSLDAVQ_P_S VRMLALDAVHAXQ_S - VMLSLDAVXQ_P_S VFMAQ_F VMLSLDAVAQ_S VQSHRUNBQ_N_S - VQRSHRUNTQ_N_S VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F - VDUPQ_M_N_F VCMPGTQ_M_F VCMPLTQ_M_F VRMLSLDAVHQ_P_S - VQSHRUNTQ_N_S VABSQ_M_F VMAXNMAVQ_P_F VFMAQ_N_F - VRMLSLDAVHXQ_P_S VREV32Q_M_F VRMLSLDAVHAQ_S - VRMLSLDAVHAXQ_S VCMPLTQ_M_N_F VCMPNEQ_M_F VRNDAQ_M_F - VRNDPQ_M_F VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F - VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F - VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F VCMLAQ_ROT90_F - VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F VRNDMQ_M_F - VCMPLEQ_M_N_F VCMPGEQ_M_N_F VRNDNQ_M_F VMINNMAVQ_P_F - VCMPNEQ_M_N_F VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S - VCMPEQQ_M_N_F VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F - VMLALDAVQ_P_U VMLALDAVQ_P_S VQMOVNBQ_M_S VQMOVNBQ_M_U - VMOVLTQ_M_U VMOVLTQ_M_S VMOVNBQ_M_U VMOVNBQ_M_S - VRSHRNTQ_N_U VRSHRNTQ_N_S VORRQ_M_N_S VORRQ_M_N_U - VREV32Q_M_S VREV32Q_M_U VQRSHRNTQ_N_U VQRSHRNTQ_N_S - VMOVNTQ_M_U VMOVNTQ_M_S VMOVLBQ_M_U VMOVLBQ_M_S - VMLALDAVAQ_S VMLALDAVAQ_U VQSHRNBQ_N_U VQSHRNBQ_N_S - VSHRNBQ_N_U VSHRNBQ_N_S VRSHRNBQ_N_S VRSHRNBQ_N_U - VMLALDAVXQ_P_U VMLALDAVXQ_P_S VQMOVNTQ_M_U VQMOVNTQ_M_S - VMVNQ_M_N_U VMVNQ_M_N_S VQSHRNTQ_N_U VQSHRNTQ_N_S - VMLALDAVAXQ_S VMLALDAVAXQ_U VSHRNTQ_N_S VSHRNTQ_N_U - VCVTBQ_M_F16_F32 VCVTBQ_M_F32_F16 VCVTTQ_M_F16_F32 - VCVTTQ_M_F32_F16 VCVTMQ_M_S VCVTMQ_M_U VCVTNQ_M_S - VCVTPQ_M_S VCVTPQ_M_U VCVTQ_M_N_FROM_F_S VCVTNQ_M_U - VREV16Q_M_S VREV16Q_M_U VREV32Q_M VCVTQ_M_FROM_F_U - VCVTQ_M_FROM_F_S VRMLALDAVHQ_P_U VADDLVAQ_P_U - VCVTQ_M_N_FROM_F_U VQSHLUQ_M_N_S VABAVQ_P_S - VABAVQ_P_U VSHLQ_M_S VSHLQ_M_U VSRIQ_M_N_S - VSRIQ_M_N_U VSUBQ_M_U VSUBQ_M_S VCVTQ_M_N_TO_F_U - VCVTQ_M_N_TO_F_S VQADDQ_M_U VQADDQ_M_S - VRSHRQ_M_N_S VSUBQ_M_N_S VSUBQ_M_N_U VBRSRQ_M_N_S - VSUBQ_M_N_F VBICQ_M_F VHADDQ_M_U VBICQ_M_U VBICQ_M_S - VMULQ_M_N_U VHADDQ_M_S VORNQ_M_F VMLAQ_M_N_S VQSUBQ_M_U - VQSUBQ_M_S VMLAQ_M_N_U VQSUBQ_M_N_U VQSUBQ_M_N_S - VMULLTQ_INT_M_S VMULLTQ_INT_M_U VMULQ_M_N_S VMULQ_M_N_F - VMLASQ_M_N_U VMLASQ_M_N_S VMAXQ_M_U VQRDMLAHQ_M_N_U - VCADDQ_ROT270_M_F VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S - VQRSHLQ_M_S VMULQ_M_F VRHADDQ_M_U VSHRQ_M_N_U - VRHADDQ_M_S VMULQ_M_S VMULQ_M_U VQRDMLASHQ_M_N_S - VRSHLQ_M_S 
VRSHLQ_M_U VRSHRQ_M_N_U VADDQ_M_N_F - VADDQ_M_N_S VADDQ_M_N_U VQRDMLASHQ_M_N_U VMAXQ_M_S - VQRDMLAHQ_M_N_S VORRQ_M_S VORRQ_M_U VORRQ_M_F - VQRSHLQ_M_U VRMULHQ_M_U VRMULHQ_M_S VMINQ_M_S VMINQ_M_U - VANDQ_M_F VANDQ_M_U VANDQ_M_S VHSUBQ_M_N_S VHSUBQ_M_N_U - VMULHQ_M_S VMULHQ_M_U VMULLBQ_INT_M_U - VMULLBQ_INT_M_S VCADDQ_ROT90_M_F - VSHRQ_M_N_S VADDQ_M_U VSLIQ_M_N_U - VQADDQ_M_N_S VBRSRQ_M_N_F VABDQ_M_F VBRSRQ_M_N_U - VEORQ_M_F VSHLQ_M_N_S VQDMLAHQ_M_N_U VQDMLAHQ_M_N_S - VSHLQ_M_N_U VMLADAVAQ_P_U VMLADAVAQ_P_S VSLIQ_M_N_S - VQSHLQ_M_U VQSHLQ_M_S VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S - VORNQ_M_U VORNQ_M_S VQSHLQ_M_N_S VQSHLQ_M_N_U VADDQ_M_S - VHADDQ_M_N_S VADDQ_M_F VQADDQ_M_N_U VEORQ_M_S VEORQ_M_U - VHSUBQ_M_S VHSUBQ_M_U VHADDQ_M_N_U VHCADDQ_ROT90_M_S - VQRDMLSDHQ_M_S VQRDMLSDHXQ_M_S VQRDMLADHXQ_M_S - VQDMULHQ_M_S VMLADAVAXQ_P_S VQDMLADHXQ_M_S - VQRDMULHQ_M_S VMLSDAVAXQ_P_S VQDMULHQ_M_N_S - VHCADDQ_ROT270_M_S VQDMLSDHQ_M_S VQDMLSDHXQ_M_S - VMLSDAVAQ_P_S VQRDMLADHQ_M_S VQDMLADHQ_M_S - VMLALDAVAQ_P_U VMLALDAVAQ_P_S VMLALDAVAXQ_P_U - VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S VQRSHRNTQ_M_N_S - VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S VQSHRNTQ_M_N_S - VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S VRSHRNTQ_M_N_U - VSHLLBQ_M_N_U VSHLLBQ_M_N_S VSHLLTQ_M_N_U VSHLLTQ_M_N_S - VSHRNBQ_M_N_S VSHRNBQ_M_N_U VSHRNTQ_M_N_S VSHRNTQ_M_N_U - VMLALDAVAXQ_P_S VQRSHRNTQ_M_N_U VQSHRNTQ_M_N_U - VRSHRNTQ_M_N_S VQRDMULHQ_M_N_S VRMLALDAVHAQ_P_S - VMLSLDAVAQ_P_S VMLSLDAVAXQ_P_S VMULLBQ_POLY_M_P - VMULLTQ_POLY_M_P VQDMULLBQ_M_N_S VQDMULLBQ_M_S - VQDMULLTQ_M_N_S VQDMULLTQ_M_S VQRSHRUNBQ_M_N_S - VQRSHRUNTQ_M_N_SVQSHRUNBQ_M_N_S VQSHRUNTQ_M_N_S - VRMLALDAVHAQ_P_U VRMLALDAVHAXQ_P_S VRMLSLDAVHAQ_P_S - VRMLSLDAVHAXQ_P_S VQRSHRUNTQ_M_N_S VQSHRUNBQ_M_N_S - VCMLAQ_M_F VCMLAQ_ROT180_M_F VCMLAQ_ROT270_M_F - VCMLAQ_ROT90_M_F VCMULQ_M_F VCMULQ_ROT180_M_F - VCMULQ_ROT270_M_F VCMULQ_ROT90_M_F VFMAQ_M_F - VFMAQ_M_N_F VFMASQ_M_N_F VFMSQ_M_F VMAXNMQ_M_F - VMINNMQ_M_F VSUBQ_M_F VSTRWQSB_S VSTRWQSB_U - VSTRBQSO_S VSTRBQSO_U VSTRBQ_S VSTRBQ_U VLDRBQGO_S - VLDRBQGO_U VLDRBQ_S VLDRBQ_U VLDRWQGB_S VLDRWQGB_U - VLD1Q_F VLD1Q_S VLD1Q_U VLDRHQ_F VLDRHQGO_S - VLDRHQGO_U VLDRHQGSO_S VLDRHQGSO_U VLDRHQ_S VLDRHQ_U - VLDRWQ_F VLDRWQ_S VLDRWQ_U VLDRDQGB_S VLDRDQGB_U - VLDRDQGO_S VLDRDQGO_U VLDRDQGSO_S VLDRDQGSO_U - VLDRHQGO_F VLDRHQGSO_F VLDRWQGB_F VLDRWQGO_F - VLDRWQGO_S VLDRWQGO_U VLDRWQGSO_F VLDRWQGSO_S - VLDRWQGSO_U VSTRHQ_F VST1Q_S VST1Q_U VSTRHQSO_S - VSTRHQSO_U VSTRHQSSO_S VSTRHQSSO_U VSTRHQ_S - VSTRHQ_U VSTRWQ_S VSTRWQ_U VSTRWQ_F VST1Q_F VSTRDQSB_S - VSTRDQSB_U VSTRDQSO_S VSTRDQSO_U VSTRDQSSO_S - VSTRDQSSO_U VSTRWQSO_S VSTRWQSO_U VSTRWQSSO_S - VSTRWQSSO_U VSTRHQSO_F VSTRHQSSO_F VSTRWQSB_F - VSTRWQSO_F VSTRWQSSO_F VDDUPQ VDDUPQ_M VDWDUPQ - VDWDUPQ_M VIDUPQ VIDUPQ_M VIWDUPQ VIWDUPQ_M - VSTRWQSBWB_S VSTRWQSBWB_U VLDRWQGBWB_S VLDRWQGBWB_U - VSTRWQSBWB_F VLDRWQGBWB_F VSTRDQSBWB_S VSTRDQSBWB_U - VLDRDQGBWB_S VLDRDQGBWB_U VADCQ_U VADCQ_M_U VADCQ_S - VADCQ_M_S VSBCIQ_U VSBCIQ_S VSBCIQ_M_U VSBCIQ_M_S - VSBCQ_U VSBCQ_S VSBCQ_M_U VSBCQ_M_S VADCIQ_U VADCIQ_M_U - VADCIQ_S VADCIQ_M_S VLD2Q VLD4Q VST2Q SRSHRL SRSHR - URSHR URSHRL SQRSHR UQRSHL UQRSHLL_64 VSHLCQ_M_U - UQRSHLL_48 SQRSHRL_64 SQRSHRL_48 VSHLCQ_M_S]) - -(define_mode_attr MVE_CNVT [(V8HI "V8HF") (V4SI "V4SF") (V8HF "V8HI") - (V4SF "V4SI")]) - -(define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") - (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u") - (VCVTAQ_U "u") (VCVTAQ_S "s") (VREV64Q_S "s") - (VREV64Q_U "u") (VMVNQ_S "s") (VMVNQ_U "u") - (VDUPQ_N_U "u") (VDUPQ_N_S"s") (VADDVQ_S "s") - (VADDVQ_U "u") (VADDVQ_S "s") 
(VADDVQ_U "u") - (VMOVLTQ_U "u") (VMOVLTQ_S "s") (VMOVLBQ_S "s") - (VMOVLBQ_U "u") (VCVTQ_FROM_F_S "s") (VCVTQ_FROM_F_U "u") - (VCVTPQ_S "s") (VCVTPQ_U "u") (VCVTNQ_S "s") - (VCVTNQ_U "u") (VCVTMQ_S "s") (VCVTMQ_U "u") - (VCLZQ_U "u") (VCLZQ_S "s") (VREV32Q_U "u") - (VREV32Q_S "s") (VADDLVQ_U "u") (VADDLVQ_S "s") - (VCVTQ_N_TO_F_S "s") (VCVTQ_N_TO_F_U "u") - (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") - (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") - (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") - (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s") - (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") - (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") - (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") - (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") - (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") - (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") - (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") - (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") - (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") - (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") - (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") - (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") - (VMAXVQ_S "s") (VMAXVQ_U "u") (VMINQ_S "s") (VMINQ_U "u") - (VMINVQ_S "s") (VMINVQ_U "u") (VMLADAVQ_S "s") - (VMLADAVQ_U "u") (VMULHQ_S "s") (VMULHQ_U "u") - (VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u") (VQADDQ_S "s") - (VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u") (VQADDQ_U "u") - (VMULQ_N_S "s") (VMULQ_N_U "u") (VMULQ_S "s") - (VMULQ_U "u") (VORNQ_S "s") (VORNQ_U "u") (VORRQ_S "s") - (VORRQ_U "u") (VQADDQ_N_S "s") (VQADDQ_N_U "u") - (VQRSHLQ_N_S "s") (VQRSHLQ_N_U "u") (VQRSHLQ_S "s") - (VQRSHLQ_U "u") (VQSHLQ_N_S "s") (VQSHLQ_N_U "u") - (VQSHLQ_R_S "s") (VQSHLQ_R_U "u") (VQSHLQ_S "s") - (VQSHLQ_U "u") (VQSUBQ_N_S "s") (VQSUBQ_N_U "u") - (VQSUBQ_S "s") (VQSUBQ_U "u") (VRHADDQ_S "s") - (VRHADDQ_U "u") (VRMULHQ_S "s") (VRMULHQ_U "u") - (VRSHLQ_N_S "s") (VRSHLQ_N_U "u") (VRSHLQ_S "s") - (VRSHLQ_U "u") (VRSHRQ_N_S "s") (VRSHRQ_N_U "u") - (VSHLQ_N_S "s") (VSHLQ_N_U "u") (VSHLQ_R_S "s") - (VSHLQ_R_U "u") (VSUBQ_N_S "s") (VSUBQ_N_U "u") - (VSUBQ_S "s") (VSUBQ_U "u") (VADDVAQ_S "s") - (VADDVAQ_U "u") (VADDLVAQ_S "s") (VADDLVAQ_U "u") - (VBICQ_N_S "s") (VBICQ_N_U "u") (VMLALDAVQ_U "u") - (VMLALDAVQ_S "s") (VMLALDAVXQ_U "u") (VMLALDAVXQ_S "s") - (VMOVNBQ_U "u") (VMOVNBQ_S "s") (VMOVNTQ_U "u") - (VMOVNTQ_S "s") (VORRQ_N_S "s") (VORRQ_N_U "u") - (VQMOVNBQ_U "u") (VQMOVNBQ_S "s") (VQMOVNTQ_S "s") - (VQMOVNTQ_U "u") (VSHLLBQ_N_U "u") (VSHLLBQ_N_S "s") - (VSHLLTQ_N_U "u") (VSHLLTQ_N_S "s") (VRMLALDAVHQ_U "u") - (VRMLALDAVHQ_S "s") (VBICQ_M_N_S "s") (VBICQ_M_N_U "u") - (VCVTAQ_M_S "s") (VCVTAQ_M_U "u") (VCVTQ_M_TO_F_S "s") - (VCVTQ_M_TO_F_U "u") (VQRSHRNBQ_N_S "s") - (VQRSHRNBQ_N_U "u") (VABAVQ_S "s") (VABAVQ_U "u") - (VRMLALDAVHAQ_U "u") (VRMLALDAVHAQ_S "s") (VSHLCQ_S "s") - (VSHLCQ_U "u") (VADDVAQ_P_S "s") (VADDVAQ_P_U "u") - (VCLZQ_M_S "s") (VCLZQ_M_U "u") (VCMPEQQ_M_N_S "s") - (VCMPEQQ_M_N_U "u") (VCMPEQQ_M_S "s") (VCMPEQQ_M_U "u") - (VCMPNEQ_M_N_S "s") (VCMPNEQ_M_N_U "u") (VCMPNEQ_M_S "s") - (VCMPNEQ_M_U "u") (VDUPQ_M_N_S "s") (VDUPQ_M_N_U "u") - (VMAXVQ_P_S "s") (VMAXVQ_P_U "u") (VMINVQ_P_S "s") - (VMINVQ_P_U "u") (VMLADAVAQ_S "s") (VMLADAVAQ_U "u") - (VMLADAVQ_P_S "s") (VMLADAVQ_P_U "u") (VMLAQ_N_S "s") - (VMLAQ_N_U "u") (VMLASQ_N_S "s") (VMLASQ_N_U "u") - (VMVNQ_M_S "s") (VMVNQ_M_U "u") (VPSELQ_S "s") - (VPSELQ_U "u") (VQDMLAHQ_N_S "s") (VQDMLAHQ_N_U "u") - (VQRDMLAHQ_N_S "s") (VQRDMLAHQ_N_U "u") - (VQRDMLASHQ_N_S "s") (VQRDMLASHQ_N_U "u") 
- (VQRSHLQ_M_N_S "s") (VQRSHLQ_M_N_U "u") - (VQSHLQ_M_R_S "s") (VQSHLQ_M_R_U "u") (VSRIQ_N_S "s") - (VREV64Q_M_S "s") (VREV64Q_M_U "u") (VSRIQ_N_U "u") - (VRSHLQ_M_N_S "s") (VRSHLQ_M_N_U "u") (VSHLQ_M_R_S "s") - (VSHLQ_M_R_U "u") (VSLIQ_N_S "s") (VSLIQ_N_U "u") - (VMLALDAVQ_P_S "s") (VQMOVNBQ_M_S "s") (VMOVLTQ_M_S "s") - (VMOVNBQ_M_S "s") (VRSHRNTQ_N_S "s") (VORRQ_M_N_S "s") - (VREV32Q_M_S "s") (VQRSHRNTQ_N_S "s") (VMOVNTQ_M_S "s") - (VMOVLBQ_M_S "s") (VMLALDAVAQ_S "s") (VQSHRNBQ_N_S "s") - (VSHRNBQ_N_S "s") (VRSHRNBQ_N_S "s") (VMLALDAVXQ_P_S "s") - (VQMOVNTQ_M_S "s") (VMVNQ_M_N_S "s") (VQSHRNTQ_N_S "s") - (VMLALDAVAXQ_S "s") (VSHRNTQ_N_S "s") (VMLALDAVQ_P_U "u") - (VQMOVNBQ_M_U "u") (VMOVLTQ_M_U "u") (VMOVNBQ_M_U "u") - (VRSHRNTQ_N_U "u") (VORRQ_M_N_U "u") (VREV32Q_M_U "u") - (VREV16Q_M_S "s") (VREV16Q_M_U "u") - (VQRSHRNTQ_N_U "u") (VMOVNTQ_M_U "u") (VMOVLBQ_M_U "u") - (VMLALDAVAQ_U "u") (VQSHRNBQ_N_U "u") (VSHRNBQ_N_U "u") - (VRSHRNBQ_N_U "u") (VMLALDAVXQ_P_U "u") - (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u") (VMLALDAVAXQ_U "u") - (VQMOVNTQ_M_U "u") (VSHRNTQ_N_U "u") (VCVTMQ_M_S "s") - (VCVTMQ_M_U "u") (VCVTNQ_M_S "s") (VCVTNQ_M_U "u") - (VCVTPQ_M_S "s") (VCVTPQ_M_U "u") (VADDLVAQ_P_S "s") - (VCVTQ_M_N_FROM_F_U "u") (VCVTQ_M_FROM_F_S "s") - (VCVTQ_M_FROM_F_U "u") (VRMLALDAVHQ_P_U "u") - (VRMLALDAVHQ_P_S "s") (VADDLVAQ_P_U "u") - (VCVTQ_M_N_FROM_F_S "s") (VABAVQ_P_U "u") - (VABAVQ_P_S "s") (VSHLQ_M_S "s") (VSHLQ_M_U "u") - (VSRIQ_M_N_S "s") (VSRIQ_M_N_U "u") (VSUBQ_M_S "s") - (VSUBQ_M_U "u") (VCVTQ_M_N_TO_F_S "s") - (VCVTQ_M_N_TO_F_U "u") (VADDQ_M_N_U "u") - (VSHLQ_M_N_S "s") (VMAXQ_M_U "u") (VHSUBQ_M_N_U "u") - (VMULQ_M_N_S "s") (VQSHLQ_M_U "u") (VRHADDQ_M_S "s") - (VEORQ_M_U "u") (VSHRQ_M_N_U "u") (VCADDQ_ROT90_M_U "u") - (VMLADAVAQ_P_U "u") (VEORQ_M_S "s") (VBRSRQ_M_N_S "s") - (VMULQ_M_U "u") (VQRDMLAHQ_M_N_S "s") (VHSUBQ_M_N_S "s") - (VQRSHLQ_M_S "s") (VMULQ_M_N_U "u") - (VMULQ_M_S "s") (VQSHLQ_M_N_U "u") (VSLIQ_M_N_U "u") - (VMLADAVAQ_P_S "s") (VQRSHLQ_M_U "u") - (VMULLBQ_INT_M_U "u") (VSHLQ_M_N_U "u") (VQSUBQ_M_U "u") - (VQRDMLASHQ_M_N_U "u") (VRSHRQ_M_N_S "s") - (VORNQ_M_S "s") (VCADDQ_ROT270_M_S "s") (VRHADDQ_M_U "u") - (VRSHRQ_M_N_U "u") (VMLASQ_M_N_U "u") (VHSUBQ_M_U "u") - (VQSUBQ_M_N_S "s") (VMULLTQ_INT_M_S "s") - (VORRQ_M_S "s") (VQDMLAHQ_M_N_U "u") (VRSHLQ_M_S "s") - (VHADDQ_M_U "u") (VHADDQ_M_N_S "s") (VMULLTQ_INT_M_U "u") - (VORRQ_M_U "u") (VHADDQ_M_S "s") (VHADDQ_M_N_U "u") - (VQDMLAHQ_M_N_S "s") (VMAXQ_M_S "s") (VORNQ_M_U "u") - (VCADDQ_ROT270_M_U "u") (VQADDQ_M_U "u") - (VQRDMLASHQ_M_N_S "s") (VBICQ_M_U "u") (VMINQ_M_U "u") - (VSUBQ_M_N_S "s") (VMULLBQ_INT_M_S "s") (VQSUBQ_M_S "s") - (VCADDQ_ROT90_M_S "s") (VRMULHQ_M_S "s") (VANDQ_M_U "u") - (VMULHQ_M_S "s") (VADDQ_M_S "s") (VQRDMLAHQ_M_N_U "u") - (VMLASQ_M_N_S "s") (VHSUBQ_M_S "s") (VRMULHQ_M_U "u") - (VQADDQ_M_N_S "s") (VSHRQ_M_N_S "s") (VANDQ_M_S "s") - (VABDQ_M_U "u") (VQSHLQ_M_S "s") (VABDQ_M_S "s") - (VSUBQ_M_N_U "u") (VMLAQ_M_N_S "s") (VBRSRQ_M_N_U "u") - (VADDQ_M_U "u") (VRSHLQ_M_U "u") (VSLIQ_M_N_S "s") - (VQADDQ_M_N_U "u") (VADDQ_M_N_S "s") (VQSUBQ_M_N_U "u") - (VMLAQ_M_N_U "u") (VMINQ_M_S "s") (VMULHQ_M_U "u") - (VQADDQ_M_S "s") (VBICQ_M_S "s") (VQSHLQ_M_N_S "s") - (VQSHRNTQ_M_N_S "s") (VQSHRNTQ_M_N_U "u") - (VSHRNTQ_M_N_U "u") (VSHRNTQ_M_N_S "s") - (VSHRNBQ_M_N_S "s") (VSHRNBQ_M_N_U "u") - (VSHLLTQ_M_N_S "s") (VSHLLTQ_M_N_U "u") - (VSHLLBQ_M_N_S "s") (VSHLLBQ_M_N_U "u") - (VRSHRNTQ_M_N_S "s") (VRSHRNTQ_M_N_U "u") - (VRSHRNBQ_M_N_U "u") (VRSHRNBQ_M_N_S "s") - (VQSHRNTQ_M_N_U "u") (VQSHRNTQ_M_N_S "s") - 
(VQSHRNBQ_M_N_S "s") (VQSHRNBQ_M_N_U "u") - (VQRSHRNTQ_M_N_S "s") (VQRSHRNTQ_M_N_U "u") - (VQRSHRNBQ_M_N_S "s") (VQRSHRNBQ_M_N_U "u") - (VMLALDAVAXQ_P_S "s") (VMLALDAVAXQ_P_U "u") - (VMLALDAVAQ_P_S "s") (VMLALDAVAQ_P_U "u") - (VSTRWQSB_S "s") (VSTRWQSB_U "u") (VSTRBQSO_S "s") - (VSTRBQSO_U "u") (VSTRBQ_S "s") (VSTRBQ_U "u") - (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRBQ_S "s") - (VLDRBQ_U "u") (VLDRWQGB_S "s") (VLDRWQGB_U "u") - (VLD1Q_S "s") (VLD1Q_U "u") (VLDRHQGO_S "s") - (VLDRHQGO_U "u") (VLDRHQGSO_S "s") (VLDRHQGSO_U "u") - (VLDRHQ_S "s") (VLDRHQ_U "u") (VLDRWQ_S "s") - (VLDRWQ_U "u") (VLDRDQGB_S "s") (VLDRDQGB_U "u") - (VLDRDQGO_S "s") (VLDRDQGO_U "u") (VLDRDQGSO_S "s") - (VLDRDQGSO_U "u") (VLDRWQGO_S "s") (VLDRWQGO_U "u") - (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") (VST1Q_S "s") - (VST1Q_U "u") (VSTRHQSO_S "s") (VSTRHQSO_U "u") - (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRHQ_S "s") - (VSTRHQ_U "u") (VSTRWQ_S "s") (VSTRWQ_U "u") - (VSTRDQSB_S "s") (VSTRDQSB_U "u") (VSTRDQSO_S "s") - (VSTRDQSO_U "u") (VSTRDQSSO_S "s") (VSTRDQSSO_U "u") - (VSTRWQSO_U "u") (VSTRWQSO_S "s") (VSTRWQSSO_U "u") - (VSTRWQSSO_S "s") (VSTRWQSBWB_S "s") (VSTRWQSBWB_U "u") - (VLDRWQGBWB_S "s") (VLDRWQGBWB_U "u") (VLDRDQGBWB_S "s") - (VLDRDQGBWB_U "u") (VSTRDQSBWB_S "s") (VADCQ_M_S "s") - (VSTRDQSBWB_U "u") (VSBCQ_U "u") (VSBCQ_M_U "u") - (VSBCQ_S "s") (VSBCQ_M_S "s") (VSBCIQ_U "u") - (VSBCIQ_M_U "u") (VSBCIQ_S "s") (VSBCIQ_M_S "s") - (VADCQ_U "u") (VADCQ_M_U "u") (VADCQ_S "s") - (VADCIQ_U "u") (VADCIQ_M_U "u") (VADCIQ_S "s") - (VADCIQ_M_S "s") (SQRSHRL_64 "64") (SQRSHRL_48 "48") - (UQRSHLL_64 "64") (UQRSHLL_48 "48") (VSHLCQ_M_S "s") - (VSHLCQ_M_U "u")]) - -(define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") - (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") - (VCTP32Q_M "32") (VCTP64Q_M "64")]) -(define_mode_attr MVE_pred2 [(V16QI "mve_imm_8") (V8HI "mve_imm_16") - (V4SI "mve_imm_32") - (V8HF "mve_imm_16") (V4SF "mve_imm_32")]) -(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf") - (V8HF "Rd") (V4SF "Rf")]) -(define_mode_attr MVE_LANES [(V16QI "16") (V8HI "8") (V4SI "4")]) -(define_mode_attr MVE_constraint [ (V16QI "Ra") (V8HI "Rc") (V4SI "Re")]) -(define_mode_attr MVE_pred [ (V16QI "mve_imm_7") (V8HI "mve_imm_15") - (V4SI "mve_imm_31")]) -(define_mode_attr MVE_constraint3 [ (V8HI "Rb") (V4SI "Rd")]) -(define_mode_attr MVE_pred3 [ (V8HI "mve_imm_8") (V4SI "mve_imm_16")]) -(define_mode_attr MVE_constraint1 [ (V8HI "Ra") (V4SI "Rc")]) -(define_mode_attr MVE_pred1 [ (V8HI "mve_imm_7") (V4SI "mve_imm_15")]) -(define_mode_attr MVE_B_ELEM [ (V16QI "V16QI") (V8HI "V8QI") (V4SI "V4QI")]) -(define_mode_attr MVE_H_ELEM [ (V8HI "V8HI") (V4SI "V4HI")]) -(define_mode_attr V_sz_elem1 [(V16QI "b") (V8HI "h") (V4SI "w") (V8HF "h") - (V4SF "w")]) -(define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32") - (V8HF "u16") (V4SF "32")]) - -(define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w") - (V8HF "=w") (V4SF "=&w")]) - -(define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U]) -(define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S]) -(define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U]) -(define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S VCVTQ_FROM_F_U]) -(define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S]) -(define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S]) -(define_int_iterator VMVNQ [VMVNQ_U VMVNQ_S]) -(define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S]) -(define_int_iterator VCLZQ [VCLZQ_U VCLZQ_S]) -(define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S]) -(define_int_iterator VREV32Q 
[VREV32Q_U VREV32Q_S]) -(define_int_iterator VMOVLBQ [VMOVLBQ_S VMOVLBQ_U]) -(define_int_iterator VMOVLTQ [VMOVLTQ_U VMOVLTQ_S]) -(define_int_iterator VCVTPQ [VCVTPQ_S VCVTPQ_U]) -(define_int_iterator VCVTNQ [VCVTNQ_S VCVTNQ_U]) -(define_int_iterator VCVTMQ [VCVTMQ_S VCVTMQ_U]) -(define_int_iterator VADDLVQ [VADDLVQ_U VADDLVQ_S]) -(define_int_iterator VCTPQ [VCTP8Q VCTP16Q VCTP32Q VCTP64Q]) -(define_int_iterator VCTPQ_M [VCTP8Q_M VCTP16Q_M VCTP32Q_M VCTP64Q_M]) -(define_int_iterator VCVTQ_N_TO_F [VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U]) -(define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S]) -(define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) -(define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) -(define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) -(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S]) -(define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) -(define_int_iterator VABDQ [VABDQ_S VABDQ_U]) -(define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) -(define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) -(define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) -(define_int_iterator VANDQ [VANDQ_U VANDQ_S]) -(define_int_iterator VBICQ [VBICQ_S VBICQ_U]) -(define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) -(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) -(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) -(define_int_iterator VEORQ [VEORQ_U VEORQ_S]) -(define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) -(define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) -(define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) -(define_int_iterator VHSUBQ_N [VHSUBQ_N_U VHSUBQ_N_S]) -(define_int_iterator VMAXQ [VMAXQ_U VMAXQ_S]) -(define_int_iterator VMAXVQ [VMAXVQ_U VMAXVQ_S]) -(define_int_iterator VMINQ [VMINQ_S VMINQ_U]) -(define_int_iterator VMINVQ [VMINVQ_U VMINVQ_S]) -(define_int_iterator VMLADAVQ [VMLADAVQ_U VMLADAVQ_S]) -(define_int_iterator VMULHQ [VMULHQ_S VMULHQ_U]) -(define_int_iterator VMULLBQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S]) -(define_int_iterator VMULLTQ_INT [VMULLTQ_INT_U VMULLTQ_INT_S]) -(define_int_iterator VMULQ [VMULQ_U VMULQ_S]) -(define_int_iterator VMULQ_N [VMULQ_N_U VMULQ_N_S]) -(define_int_iterator VORNQ [VORNQ_U VORNQ_S]) -(define_int_iterator VORRQ [VORRQ_S VORRQ_U]) -(define_int_iterator VQADDQ [VQADDQ_U VQADDQ_S]) -(define_int_iterator VQADDQ_N [VQADDQ_N_S VQADDQ_N_U]) -(define_int_iterator VQRSHLQ [VQRSHLQ_S VQRSHLQ_U]) -(define_int_iterator VQRSHLQ_N [VQRSHLQ_N_S VQRSHLQ_N_U]) -(define_int_iterator VQSHLQ [VQSHLQ_S VQSHLQ_U]) -(define_int_iterator VQSHLQ_N [VQSHLQ_N_S VQSHLQ_N_U]) -(define_int_iterator VQSHLQ_R [VQSHLQ_R_U VQSHLQ_R_S]) -(define_int_iterator VQSUBQ [VQSUBQ_U VQSUBQ_S]) -(define_int_iterator VQSUBQ_N [VQSUBQ_N_S VQSUBQ_N_U]) -(define_int_iterator VRHADDQ [VRHADDQ_S VRHADDQ_U]) -(define_int_iterator VRMULHQ [VRMULHQ_S VRMULHQ_U]) -(define_int_iterator VRSHLQ [VRSHLQ_S VRSHLQ_U]) -(define_int_iterator VRSHLQ_N [VRSHLQ_N_U VRSHLQ_N_S]) -(define_int_iterator VRSHRQ_N [VRSHRQ_N_S VRSHRQ_N_U]) -(define_int_iterator VSHLQ_N [VSHLQ_N_U VSHLQ_N_S]) -(define_int_iterator VSHLQ_R [VSHLQ_R_S VSHLQ_R_U]) -(define_int_iterator VSUBQ [VSUBQ_S VSUBQ_U]) -(define_int_iterator VSUBQ_N [VSUBQ_N_S VSUBQ_N_U]) -(define_int_iterator VADDLVAQ [VADDLVAQ_S VADDLVAQ_U]) -(define_int_iterator VBICQ_N [VBICQ_N_S VBICQ_N_U]) -(define_int_iterator VMLALDAVQ [VMLALDAVQ_U 
VMLALDAVQ_S]) -(define_int_iterator VMLALDAVXQ [VMLALDAVXQ_U VMLALDAVXQ_S]) -(define_int_iterator VMOVNBQ [VMOVNBQ_U VMOVNBQ_S]) -(define_int_iterator VMOVNTQ [VMOVNTQ_S VMOVNTQ_U]) -(define_int_iterator VORRQ_N [VORRQ_N_U VORRQ_N_S]) -(define_int_iterator VQMOVNBQ [VQMOVNBQ_U VQMOVNBQ_S]) -(define_int_iterator VQMOVNTQ [VQMOVNTQ_U VQMOVNTQ_S]) -(define_int_iterator VSHLLBQ_N [VSHLLBQ_N_S VSHLLBQ_N_U]) -(define_int_iterator VSHLLTQ_N [VSHLLTQ_N_U VSHLLTQ_N_S]) -(define_int_iterator VRMLALDAVHQ [VRMLALDAVHQ_U VRMLALDAVHQ_S]) -(define_int_iterator VBICQ_M_N [VBICQ_M_N_S VBICQ_M_N_U]) -(define_int_iterator VCVTAQ_M [VCVTAQ_M_S VCVTAQ_M_U]) -(define_int_iterator VCVTQ_M_TO_F [VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U]) -(define_int_iterator VQRSHRNBQ_N [VQRSHRNBQ_N_U VQRSHRNBQ_N_S]) -(define_int_iterator VABAVQ [VABAVQ_S VABAVQ_U]) -(define_int_iterator VSHLCQ [VSHLCQ_S VSHLCQ_U]) -(define_int_iterator VRMLALDAVHAQ [VRMLALDAVHAQ_S VRMLALDAVHAQ_U]) -(define_int_iterator VADDVAQ_P [VADDVAQ_P_S VADDVAQ_P_U]) -(define_int_iterator VCLZQ_M [VCLZQ_M_S VCLZQ_M_U]) -(define_int_iterator VCMPEQQ_M_N [VCMPEQQ_M_N_S VCMPEQQ_M_N_U]) -(define_int_iterator VCMPEQQ_M [VCMPEQQ_M_S VCMPEQQ_M_U]) -(define_int_iterator VCMPNEQ_M_N [VCMPNEQ_M_N_S VCMPNEQ_M_N_U]) -(define_int_iterator VCMPNEQ_M [VCMPNEQ_M_S VCMPNEQ_M_U]) -(define_int_iterator VDUPQ_M_N [VDUPQ_M_N_S VDUPQ_M_N_U]) -(define_int_iterator VMAXVQ_P [VMAXVQ_P_S VMAXVQ_P_U]) -(define_int_iterator VMINVQ_P [VMINVQ_P_S VMINVQ_P_U]) -(define_int_iterator VMLADAVAQ [VMLADAVAQ_S VMLADAVAQ_U]) -(define_int_iterator VMLADAVQ_P [VMLADAVQ_P_S VMLADAVQ_P_U]) -(define_int_iterator VMLAQ_N [VMLAQ_N_S VMLAQ_N_U]) -(define_int_iterator VMLASQ_N [VMLASQ_N_S VMLASQ_N_U]) -(define_int_iterator VMVNQ_M [VMVNQ_M_S VMVNQ_M_U]) -(define_int_iterator VPSELQ [VPSELQ_S VPSELQ_U]) -(define_int_iterator VQDMLAHQ_N [VQDMLAHQ_N_S VQDMLAHQ_N_U]) -(define_int_iterator VQRDMLAHQ_N [VQRDMLAHQ_N_S VQRDMLAHQ_N_U]) -(define_int_iterator VQRDMLASHQ_N [VQRDMLASHQ_N_S VQRDMLASHQ_N_U]) -(define_int_iterator VQRSHLQ_M_N [VQRSHLQ_M_N_S VQRSHLQ_M_N_U]) -(define_int_iterator VQSHLQ_M_R [VQSHLQ_M_R_S VQSHLQ_M_R_U]) -(define_int_iterator VREV64Q_M [VREV64Q_M_S VREV64Q_M_U]) -(define_int_iterator VRSHLQ_M_N [VRSHLQ_M_N_S VRSHLQ_M_N_U]) -(define_int_iterator VSHLQ_M_R [VSHLQ_M_R_S VSHLQ_M_R_U]) -(define_int_iterator VSLIQ_N [VSLIQ_N_S VSLIQ_N_U]) -(define_int_iterator VSRIQ_N [VSRIQ_N_S VSRIQ_N_U]) -(define_int_iterator VMLALDAVQ_P [VMLALDAVQ_P_U VMLALDAVQ_P_S]) -(define_int_iterator VQMOVNBQ_M [VQMOVNBQ_M_S VQMOVNBQ_M_U]) -(define_int_iterator VMOVLTQ_M [VMOVLTQ_M_U VMOVLTQ_M_S]) -(define_int_iterator VMOVNBQ_M [VMOVNBQ_M_U VMOVNBQ_M_S]) -(define_int_iterator VRSHRNTQ_N [VRSHRNTQ_N_U VRSHRNTQ_N_S]) -(define_int_iterator VORRQ_M_N [VORRQ_M_N_S VORRQ_M_N_U]) -(define_int_iterator VREV32Q_M [VREV32Q_M_S VREV32Q_M_U]) -(define_int_iterator VREV16Q_M [VREV16Q_M_S VREV16Q_M_U]) -(define_int_iterator VQRSHRNTQ_N [VQRSHRNTQ_N_U VQRSHRNTQ_N_S]) -(define_int_iterator VMOVNTQ_M [VMOVNTQ_M_U VMOVNTQ_M_S]) -(define_int_iterator VMOVLBQ_M [VMOVLBQ_M_U VMOVLBQ_M_S]) -(define_int_iterator VMLALDAVAQ [VMLALDAVAQ_S VMLALDAVAQ_U]) -(define_int_iterator VQSHRNBQ_N [VQSHRNBQ_N_U VQSHRNBQ_N_S]) -(define_int_iterator VSHRNBQ_N [VSHRNBQ_N_U VSHRNBQ_N_S]) -(define_int_iterator VRSHRNBQ_N [VRSHRNBQ_N_S VRSHRNBQ_N_U]) -(define_int_iterator VMLALDAVXQ_P [VMLALDAVXQ_P_U VMLALDAVXQ_P_S]) -(define_int_iterator VQMOVNTQ_M [VQMOVNTQ_M_U VQMOVNTQ_M_S]) -(define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S]) -(define_int_iterator VQSHRNTQ_N 
[VQSHRNTQ_N_U VQSHRNTQ_N_S]) -(define_int_iterator VMLALDAVAXQ [VMLALDAVAXQ_S VMLALDAVAXQ_U]) -(define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U]) -(define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U]) -(define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U]) -(define_int_iterator VCVTPQ_M [VCVTPQ_M_S VCVTPQ_M_U]) -(define_int_iterator VCVTQ_M_N_FROM_F [VCVTQ_M_N_FROM_F_S VCVTQ_M_N_FROM_F_U]) -(define_int_iterator VCVTQ_M_FROM_F [VCVTQ_M_FROM_F_U VCVTQ_M_FROM_F_S]) -(define_int_iterator VRMLALDAVHQ_P [VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U]) -(define_int_iterator VADDLVAQ_P [VADDLVAQ_P_U VADDLVAQ_P_S]) -(define_int_iterator VABAVQ_P [VABAVQ_P_S VABAVQ_P_U]) -(define_int_iterator VSHLQ_M [VSHLQ_M_S VSHLQ_M_U]) -(define_int_iterator VSRIQ_M_N [VSRIQ_M_N_S VSRIQ_M_N_U]) -(define_int_iterator VSUBQ_M [VSUBQ_M_U VSUBQ_M_S]) -(define_int_iterator VCVTQ_M_N_TO_F [VCVTQ_M_N_TO_F_U VCVTQ_M_N_TO_F_S]) -(define_int_iterator VHSUBQ_M [VHSUBQ_M_S VHSUBQ_M_U]) -(define_int_iterator VSLIQ_M_N [VSLIQ_M_N_U VSLIQ_M_N_S]) -(define_int_iterator VRSHLQ_M [VRSHLQ_M_S VRSHLQ_M_U]) -(define_int_iterator VMINQ_M [VMINQ_M_S VMINQ_M_U]) -(define_int_iterator VMULLBQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S]) -(define_int_iterator VMULHQ_M [VMULHQ_M_S VMULHQ_M_U]) -(define_int_iterator VMULQ_M [VMULQ_M_S VMULQ_M_U]) -(define_int_iterator VHSUBQ_M_N [VHSUBQ_M_N_S VHSUBQ_M_N_U]) -(define_int_iterator VHADDQ_M_N [VHADDQ_M_N_S VHADDQ_M_N_U]) -(define_int_iterator VORRQ_M [VORRQ_M_S VORRQ_M_U]) -(define_int_iterator VRMULHQ_M [VRMULHQ_M_U VRMULHQ_M_S]) -(define_int_iterator VQADDQ_M [VQADDQ_M_U VQADDQ_M_S]) -(define_int_iterator VRSHRQ_M_N [VRSHRQ_M_N_S VRSHRQ_M_N_U]) -(define_int_iterator VQSUBQ_M_N [VQSUBQ_M_N_U VQSUBQ_M_N_S]) -(define_int_iterator VADDQ_M [VADDQ_M_U VADDQ_M_S]) -(define_int_iterator VORNQ_M [VORNQ_M_U VORNQ_M_S]) -(define_int_iterator VRHADDQ_M [VRHADDQ_M_U VRHADDQ_M_S]) -(define_int_iterator VQSHLQ_M [VQSHLQ_M_U VQSHLQ_M_S]) -(define_int_iterator VANDQ_M [VANDQ_M_U VANDQ_M_S]) -(define_int_iterator VBICQ_M [VBICQ_M_U VBICQ_M_S]) -(define_int_iterator VSHLQ_M_N [VSHLQ_M_N_S VSHLQ_M_N_U]) -(define_int_iterator VCADDQ_ROT270_M [VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S]) -(define_int_iterator VQRSHLQ_M [VQRSHLQ_M_U VQRSHLQ_M_S]) -(define_int_iterator VQADDQ_M_N [VQADDQ_M_N_U VQADDQ_M_N_S]) -(define_int_iterator VADDQ_M_N [VADDQ_M_N_S VADDQ_M_N_U]) -(define_int_iterator VMAXQ_M [VMAXQ_M_S VMAXQ_M_U]) -(define_int_iterator VQSUBQ_M [VQSUBQ_M_U VQSUBQ_M_S]) -(define_int_iterator VMLASQ_M_N [VMLASQ_M_N_U VMLASQ_M_N_S]) -(define_int_iterator VMLADAVAQ_P [VMLADAVAQ_P_U VMLADAVAQ_P_S]) -(define_int_iterator VBRSRQ_M_N [VBRSRQ_M_N_U VBRSRQ_M_N_S]) -(define_int_iterator VMULQ_M_N [VMULQ_M_N_U VMULQ_M_N_S]) -(define_int_iterator VCADDQ_ROT90_M [VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S]) -(define_int_iterator VMULLTQ_INT_M [VMULLTQ_INT_M_S VMULLTQ_INT_M_U]) -(define_int_iterator VEORQ_M [VEORQ_M_S VEORQ_M_U]) -(define_int_iterator VSHRQ_M_N [VSHRQ_M_N_S VSHRQ_M_N_U]) -(define_int_iterator VSUBQ_M_N [VSUBQ_M_N_S VSUBQ_M_N_U]) -(define_int_iterator VHADDQ_M [VHADDQ_M_S VHADDQ_M_U]) -(define_int_iterator VABDQ_M [VABDQ_M_S VABDQ_M_U]) -(define_int_iterator VMLAQ_M_N [VMLAQ_M_N_S VMLAQ_M_N_U]) -(define_int_iterator VQSHLQ_M_N [VQSHLQ_M_N_S VQSHLQ_M_N_U]) -(define_int_iterator VMLALDAVAQ_P [VMLALDAVAQ_P_U VMLALDAVAQ_P_S]) -(define_int_iterator VMLALDAVAXQ_P [VMLALDAVAXQ_P_U VMLALDAVAXQ_P_S]) -(define_int_iterator VQRSHRNBQ_M_N [VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S]) -(define_int_iterator VQRSHRNTQ_M_N [VQRSHRNTQ_M_N_S VQRSHRNTQ_M_N_U]) 
-(define_int_iterator VQSHRNBQ_M_N [VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S]) -(define_int_iterator VQSHRNTQ_M_N [VQSHRNTQ_M_N_S VQSHRNTQ_M_N_U]) -(define_int_iterator VRSHRNBQ_M_N [VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S]) -(define_int_iterator VRSHRNTQ_M_N [VRSHRNTQ_M_N_U VRSHRNTQ_M_N_S]) -(define_int_iterator VSHLLBQ_M_N [VSHLLBQ_M_N_U VSHLLBQ_M_N_S]) -(define_int_iterator VSHLLTQ_M_N [VSHLLTQ_M_N_U VSHLLTQ_M_N_S]) -(define_int_iterator VSHRNBQ_M_N [VSHRNBQ_M_N_S VSHRNBQ_M_N_U]) -(define_int_iterator VSHRNTQ_M_N [VSHRNTQ_M_N_S VSHRNTQ_M_N_U]) -(define_int_iterator VSTRWSBQ [VSTRWQSB_S VSTRWQSB_U]) -(define_int_iterator VSTRBSOQ [VSTRBQSO_S VSTRBQSO_U]) -(define_int_iterator VSTRBQ [VSTRBQ_S VSTRBQ_U]) -(define_int_iterator VLDRBGOQ [VLDRBQGO_S VLDRBQGO_U]) -(define_int_iterator VLDRBQ [VLDRBQ_S VLDRBQ_U]) -(define_int_iterator VLDRWGBQ [VLDRWQGB_S VLDRWQGB_U]) -(define_int_iterator VLD1Q [VLD1Q_S VLD1Q_U]) -(define_int_iterator VLDRHGOQ [VLDRHQGO_S VLDRHQGO_U]) -(define_int_iterator VLDRHGSOQ [VLDRHQGSO_S VLDRHQGSO_U]) -(define_int_iterator VLDRHQ [VLDRHQ_S VLDRHQ_U]) -(define_int_iterator VLDRWQ [VLDRWQ_S VLDRWQ_U]) -(define_int_iterator VLDRDGBQ [VLDRDQGB_S VLDRDQGB_U]) -(define_int_iterator VLDRDGOQ [VLDRDQGO_S VLDRDQGO_U]) -(define_int_iterator VLDRDGSOQ [VLDRDQGSO_S VLDRDQGSO_U]) -(define_int_iterator VLDRWGOQ [VLDRWQGO_S VLDRWQGO_U]) -(define_int_iterator VLDRWGSOQ [VLDRWQGSO_S VLDRWQGSO_U]) -(define_int_iterator VST1Q [VST1Q_S VST1Q_U]) -(define_int_iterator VSTRHSOQ [VSTRHQSO_S VSTRHQSO_U]) -(define_int_iterator VSTRHSSOQ [VSTRHQSSO_S VSTRHQSSO_U]) -(define_int_iterator VSTRHQ [VSTRHQ_S VSTRHQ_U]) -(define_int_iterator VSTRWQ [VSTRWQ_S VSTRWQ_U]) -(define_int_iterator VSTRDSBQ [VSTRDQSB_S VSTRDQSB_U]) -(define_int_iterator VSTRDSOQ [VSTRDQSO_S VSTRDQSO_U]) -(define_int_iterator VSTRDSSOQ [VSTRDQSSO_S VSTRDQSSO_U]) -(define_int_iterator VSTRWSOQ [VSTRWQSO_S VSTRWQSO_U]) -(define_int_iterator VSTRWSSOQ [VSTRWQSSO_S VSTRWQSSO_U]) -(define_int_iterator VSTRWSBWBQ [VSTRWQSBWB_S VSTRWQSBWB_U]) -(define_int_iterator VLDRWGBWBQ [VLDRWQGBWB_S VLDRWQGBWB_U]) -(define_int_iterator VSTRDSBWBQ [VSTRDQSBWB_S VSTRDQSBWB_U]) -(define_int_iterator VLDRDGBWBQ [VLDRDQGBWB_S VLDRDQGBWB_U]) -(define_int_iterator VADCIQ [VADCIQ_U VADCIQ_S]) -(define_int_iterator VADCIQ_M [VADCIQ_M_U VADCIQ_M_S]) -(define_int_iterator VSBCQ [VSBCQ_U VSBCQ_S]) -(define_int_iterator VSBCQ_M [VSBCQ_M_U VSBCQ_M_S]) -(define_int_iterator VSBCIQ [VSBCIQ_U VSBCIQ_S]) -(define_int_iterator VSBCIQ_M [VSBCIQ_M_U VSBCIQ_M_S]) -(define_int_iterator VADCQ [VADCQ_U VADCQ_S]) -(define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) -(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) -(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) -(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) - (define_insn "*mve_mov<mode>" [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Ux,w") (match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,Uxi,r,Dm,w,Ul"))] @@ -1977,15 +1329,25 @@ ;; ;; [vmaxq_u, vmaxq_s]) ;; -(define_insn "mve_vmaxq_<supf><mode>" +(define_insn "mve_vmaxq_s<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VMAXQ)) + (smax:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmax.%#<V_s_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + +(define_insn "mve_vmaxq_u<mode>" + [ + (set 
(match_operand:MVE_2 0 "s_register_operand" "=w") + (umax:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" - "vmax.<supf>%#<V_sz_elem>\t%q0, %q1, %q2" + "vmax.%#<V_u_elem>\t%q0, %q1, %q2" [(set_attr "type" "mve_move") ]) @@ -2037,15 +1399,25 @@ ;; ;; [vminq_s, vminq_u]) ;; -(define_insn "mve_vminq_<supf><mode>" +(define_insn "mve_vminq_s<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VMINQ)) + (smin:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmin.%#<V_s_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + +(define_insn "mve_vminq_u<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (umin:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" - "vmin.<supf>%#<V_sz_elem>\t%q0, %q1, %q2" + "vmin.%#<V_u_elem>\t%q0, %q1, %q2" [(set_attr "type" "mve_move") ]) @@ -2199,6 +1571,17 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vmulq<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (mult:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmul.i%#<V_sz_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + ;; ;; [vornq_u, vornq_s]) ;; @@ -2574,6 +1957,17 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vsubq<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (minus:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vsub.i%#<V_sz_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + ;; ;; [vabdq_f]) ;; @@ -3030,9 +2424,8 @@ (define_insn "mve_vmaxnmq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMAXNMQ_F)) + (smax:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vmaxnm.f%#<V_sz_elem> %q0, %q1, %q2" @@ -3090,9 +2483,8 @@ (define_insn "mve_vminnmq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMINNMQ_F)) + (smin:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vminnm.f%#<V_sz_elem> %q0, %q1, %q2" @@ -3210,9 +2602,8 @@ (define_insn "mve_vmulq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMULQ_F)) + (mult:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vmul.f%#<V_sz_elem> %q0, %q1, %q2" @@ -3480,9 +2871,8 @@ (define_insn "mve_vsubq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VSUBQ_F)) + (minus:MVE_0 
(match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vsub.f%#<V_sz_elem>\t%q0, %q1, %q2" @@ -4310,7 +3700,7 @@ (set_attr "length""8")]) ;; -;; [vqdmlahq_n_s, vqdmlahq_n_u]) +;; [vqdmlahq_n_s]) ;; (define_insn "mve_vqdmlahq_n_<supf><mode>" [ @@ -4326,6 +3716,22 @@ ]) ;; +;; [vqdmlashq_n_s]) +;; +(define_insn "mve_vqdmlashq_n_<supf><mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") + (match_operand:MVE_2 2 "s_register_operand" "w") + (match_operand:<V_elem> 3 "s_register_operand" "r")] + VQDMLASHQ_N)) + ] + "TARGET_HAVE_MVE" + "vqdmlash.s%#<V_sz_elem>\t%q0, %q2, %3" + [(set_attr "type" "mve_move") +]) + +;; ;; [vqnegq_m_s]) ;; (define_insn "mve_vqnegq_m_s<mode>" @@ -4374,7 +3780,7 @@ ]) ;; -;; [vqrdmlahq_n_s, vqrdmlahq_n_u]) +;; [vqrdmlahq_n_s]) ;; (define_insn "mve_vqrdmlahq_n_<supf><mode>" [ @@ -4390,7 +3796,7 @@ ]) ;; -;; [vqrdmlashq_n_s, vqrdmlashq_n_u]) +;; [vqrdmlashq_n_s]) ;; (define_insn "mve_vqrdmlashq_n_<supf><mode>" [ @@ -6552,6 +5958,23 @@ (set_attr "length""8")]) ;; +;; [vqdmlashq_m_n_s]) +;; +(define_insn "mve_vqdmlashq_m_n_s<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") + (match_operand:MVE_2 2 "s_register_operand" "w") + (match_operand:<V_elem> 3 "s_register_operand" "r") + (match_operand:HI 4 "vpr_register_operand" "Up")] + VQDMLASHQ_M_N_S)) + ] + "TARGET_HAVE_MVE" + "vpst\;vqdmlasht.s%#<V_sz_elem>\t%q0, %q2, %3" + [(set_attr "type" "mve_move") + (set_attr "length""8")]) + +;; ;; [vqrdmlahq_m_n_s]) ;; (define_insn "mve_vqrdmlahq_m_n_s<mode>" @@ -7113,7 +6536,7 @@ (set_attr "length""8")]) ;; -;; [vmlaldavaxq_p_u, vmlaldavaxq_p_s]) +;; [vmlaldavaxq_p_s]) ;; (define_insn "mve_vmlaldavaxq_p_<supf><mode>" [ @@ -10315,38 +9738,10 @@ [(set_attr "type" "mve_move") (set_attr "length""8")]) -(define_expand "mve_vstrwq_scatter_base_wb_<supf>v4si" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "w") - (unspec:V4SI [(const_int 0)] VSTRWSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_<supf>v4si_insn (ignore_wb, operands[0], - operands[1], operands[2])); - DONE; -}) - -(define_expand "mve_vstrwq_scatter_base_wb_add_<supf>v4si" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "0") - (unspec:V4SI [(const_int 0)] VSTRWSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_<supf>v4si_insn (operands[0], operands[2], - operands[1], ignore_vec)); - DONE; -}) - ;; -;; [vstrwq_scatter_base_wb_s vstrdq_scatter_base_wb_u] +;; [vstrwq_scatter_base_wb_s vstrwq_scatter_base_wb_u] ;; -(define_insn "mve_vstrwq_scatter_base_wb_<supf>v4si_insn" +(define_insn "mve_vstrwq_scatter_base_wb_<supf>v4si" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:V4SI 1 "s_register_operand" "0") @@ -10368,42 +9763,10 @@ } [(set_attr "length" "4")]) -(define_expand "mve_vstrwq_scatter_base_wb_p_<supf>v4si" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V4SI [(const_int 0)] 
VSTRWSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_p_<supf>v4si_insn (ignore_wb, operands[0], - operands[1], operands[2], - operands[3])); - DONE; -}) - -(define_expand "mve_vstrwq_scatter_base_wb_p_add_<supf>v4si" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "0") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V4SI [(const_int 0)] VSTRWSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_p_<supf>v4si_insn (operands[0], operands[2], - operands[1], ignore_vec, - operands[3])); - DONE; -}) - ;; ;; [vstrwq_scatter_base_wb_p_s vstrwq_scatter_base_wb_p_u] ;; -(define_insn "mve_vstrwq_scatter_base_wb_p_<supf>v4si_insn" +(define_insn "mve_vstrwq_scatter_base_wb_p_<supf>v4si" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:V4SI 1 "s_register_operand" "0") @@ -10426,38 +9789,10 @@ } [(set_attr "length" "8")]) -(define_expand "mve_vstrwq_scatter_base_wb_fv4sf" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SF 2 "s_register_operand" "w") - (unspec:V4SI [(const_int 0)] VSTRWQSBWB_F)] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ignore_wb = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_fv4sf_insn (ignore_wb,operands[0], - operands[1], operands[2])); - DONE; -}) - -(define_expand "mve_vstrwq_scatter_base_wb_add_fv4sf" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "0") - (unspec:V4SI [(const_int 0)] VSTRWQSBWB_F)] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ignore_vec = gen_reg_rtx (V4SFmode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_fv4sf_insn (operands[0], operands[2], - operands[1], ignore_vec)); - DONE; -}) - ;; ;; [vstrwq_scatter_base_wb_f] ;; -(define_insn "mve_vstrwq_scatter_base_wb_fv4sf_insn" +(define_insn "mve_vstrwq_scatter_base_wb_fv4sf" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:V4SI 1 "s_register_operand" "0") @@ -10479,42 +9814,10 @@ } [(set_attr "length" "4")]) -(define_expand "mve_vstrwq_scatter_base_wb_p_fv4sf" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V4SI [(const_int 0)] VSTRWQSBWB_F)] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ignore_wb = gen_reg_rtx (V4SImode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_p_fv4sf_insn (ignore_wb, operands[0], - operands[1], operands[2], - operands[3])); - DONE; -}) - -(define_expand "mve_vstrwq_scatter_base_wb_p_add_fv4sf" - [(match_operand:V4SI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V4SI 2 "s_register_operand" "0") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V4SI [(const_int 0)] VSTRWQSBWB_F)] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ignore_vec = gen_reg_rtx (V4SFmode); - emit_insn ( - gen_mve_vstrwq_scatter_base_wb_p_fv4sf_insn (operands[0], operands[2], - operands[1], ignore_vec, - operands[3])); - DONE; -}) - ;; ;; [vstrwq_scatter_base_wb_p_f] ;; -(define_insn "mve_vstrwq_scatter_base_wb_p_fv4sf_insn" +(define_insn "mve_vstrwq_scatter_base_wb_p_fv4sf" [(set (mem:BLK (scratch)) (unspec:BLK 
[(match_operand:V4SI 1 "s_register_operand" "0") @@ -10537,38 +9840,10 @@ } [(set_attr "length" "8")]) -(define_expand "mve_vstrdq_scatter_base_wb_<supf>v2di" - [(match_operand:V2DI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V2DI 2 "s_register_operand" "w") - (unspec:V2DI [(const_int 0)] VSTRDSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (V2DImode); - emit_insn ( - gen_mve_vstrdq_scatter_base_wb_<supf>v2di_insn (ignore_wb, operands[0], - operands[1], operands[2])); - DONE; -}) - -(define_expand "mve_vstrdq_scatter_base_wb_add_<supf>v2di" - [(match_operand:V2DI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V2DI 2 "s_register_operand" "0") - (unspec:V2DI [(const_int 0)] VSTRDSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (V2DImode); - emit_insn ( - gen_mve_vstrdq_scatter_base_wb_<supf>v2di_insn (operands[0], operands[2], - operands[1], ignore_vec)); - DONE; -}) - ;; ;; [vstrdq_scatter_base_wb_s vstrdq_scatter_base_wb_u] ;; -(define_insn "mve_vstrdq_scatter_base_wb_<supf>v2di_insn" +(define_insn "mve_vstrdq_scatter_base_wb_<supf>v2di" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:V2DI 1 "s_register_operand" "0") @@ -10590,42 +9865,10 @@ } [(set_attr "length" "4")]) -(define_expand "mve_vstrdq_scatter_base_wb_p_<supf>v2di" - [(match_operand:V2DI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V2DI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V2DI [(const_int 0)] VSTRDSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (V2DImode); - emit_insn ( - gen_mve_vstrdq_scatter_base_wb_p_<supf>v2di_insn (ignore_wb, operands[0], - operands[1], operands[2], - operands[3])); - DONE; -}) - -(define_expand "mve_vstrdq_scatter_base_wb_p_add_<supf>v2di" - [(match_operand:V2DI 0 "s_register_operand" "=w") - (match_operand:SI 1 "mve_vldrd_immediate" "Ri") - (match_operand:V2DI 2 "s_register_operand" "0") - (match_operand:HI 3 "vpr_register_operand") - (unspec:V2DI [(const_int 0)] VSTRDSBWBQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (V2DImode); - emit_insn ( - gen_mve_vstrdq_scatter_base_wb_p_<supf>v2di_insn (operands[0], operands[2], - operands[1], ignore_vec, - operands[3])); - DONE; -}) - ;; ;; [vstrdq_scatter_base_wb_p_s vstrdq_scatter_base_wb_p_u] ;; -(define_insn "mve_vstrdq_scatter_base_wb_p_<supf>v2di_insn" +(define_insn "mve_vstrdq_scatter_base_wb_p_<supf>v2di" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:V2DI 1 "s_register_operand" "0") @@ -10643,7 +9886,7 @@ ops[0] = operands[1]; ops[1] = operands[2]; ops[2] = operands[3]; - output_asm_insn ("vpst\;\tvstrdt.u64\t%q2, [%q0, %1]!",ops); + output_asm_insn ("vpst;vstrdt.u64\t%q2, [%q0, %1]!",ops); return ""; } [(set_attr "length" "8")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 3e7b51d..2d76769 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -501,7 +501,7 @@ [(set (match_operand:VDQ 0 "s_register_operand" "=w") (plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") (match_operand:VDQ 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -509,54 +509,11 @@ (const_string "neon_add<q>")))] ) -;; As with SFmode, full support for HFmode vector arithmetic is only 
available -;; when flag-unsafe-math-optimizations is enabled. - -;; Add pattern with modes V8HF and V4HF is split into separate patterns to add -;; support for standard pattern addv8hf3 in MVE. Following pattern is called -;; from "addv8hf3" standard pattern inside vec-common.md file. - -(define_insn "addv8hf3_neon" - [(set - (match_operand:V8HF 0 "s_register_operand" "=w") - (plus:V8HF - (match_operand:V8HF 1 "s_register_operand" "w") - (match_operand:V8HF 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vadd.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_addsub_s_q")] -) - -(define_insn "addv4hf3" - [(set - (match_operand:V4HF 0 "s_register_operand" "=w") - (plus:V4HF - (match_operand:V4HF 1 "s_register_operand" "w") - (match_operand:V4HF 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vadd.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_addsub_s_q")] -) - -(define_insn "add<mode>3_fp16" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (plus:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST" - "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set (attr "type") - (if_then_else (match_test "<Is_float_mode>") - (const_string "neon_fp_addsub_s<q>") - (const_string "neon_add<q>")))] -) - (define_insn "*sub<mode>3_neon" [(set (match_operand:VDQ 0 "s_register_operand" "=w") (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") (match_operand:VDQ 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -564,33 +521,11 @@ (const_string "neon_sub<q>")))] ) -(define_insn "sub<mode>3" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (minus:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_sub<q>")] -) - -(define_insn "sub<mode>3_fp16" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (minus:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST" - "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_sub<q>")] -) - (define_insn "*mul<mode>3_neon" [(set (match_operand:VDQW 0 "s_register_operand" "=w") (mult:VDQW (match_operand:VDQW 1 "s_register_operand" "w") (match_operand:VDQW 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmul.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -635,7 +570,7 @@ (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w") (match_operand:VDQW 3 "s_register_operand" "w")) (match_operand:VDQW 1 "s_register_operand" "0")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -648,7 +583,7 @@ (plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w") (match_operand:VH 3 "s_register_operand" "w")) (match_operand:VH 1 
"s_register_operand" "0")))] - "TARGET_NEON_FP16INST && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmla.f16\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -658,7 +593,7 @@ (minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0") (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w") (match_operand:VDQW 3 "s_register_operand" "w"))))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -676,7 +611,7 @@ (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w") (match_operand:VCVTF 2 "register_operand" "w") (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH && TARGET_FMA" "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -697,18 +632,7 @@ (match_operand:VH 1 "register_operand" "w") (match_operand:VH 2 "register_operand" "w") (match_operand:VH 3 "register_operand" "0")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_mla_s<q>")] -) - -(define_insn "fma<VH:mode>4_intrinsic" - [(set (match_operand:VH 0 "register_operand" "=w") - (fma:VH - (match_operand:VH 1 "register_operand" "w") - (match_operand:VH 2 "register_operand" "w") - (match_operand:VH 3 "register_operand" "0")))] - "TARGET_NEON_FP16INST" + "ARM_HAVE_NEON_<MODE>_ARITH" "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -718,7 +642,7 @@ (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) (match_operand:VCVTF 2 "register_operand" "w") (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH && TARGET_FMA" "vfms.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -1238,7 +1162,7 @@ (parallel [(const_int 0) (const_int 1)])) (vec_select:V2SF (match_dup 1) (parallel [(const_int 2) (const_int 3)]))))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_V4SF_ARITH" "<VQH_mnem>.f32\t%P0, %e1, %f1" [(set_attr "vqh_mnem" "<VQH_mnem>") (set_attr "type" "neon_fp_reduc_<VQH_type>_s_q")] @@ -1305,7 +1229,7 @@ (define_expand "reduc_plus_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); neon_pairwise_reduce (vec, operands[1], <MODE>mode, @@ -1318,8 +1242,7 @@ (define_expand "reduc_plus_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VQ 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1354,7 +1277,7 @@ (define_expand "reduc_smin_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); @@ -1368,8 +1291,7 @@ (define_expand "reduc_smin_scal_<mode>" [(match_operand:<V_elem> 0 
"nonimmediate_operand") (match_operand:VQ 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1382,7 +1304,7 @@ (define_expand "reduc_smax_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); neon_pairwise_reduce (vec, operands[1], <MODE>mode, @@ -1395,8 +1317,7 @@ (define_expand "reduc_smax_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VQ 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1573,6 +1494,30 @@ [(set_attr "type" "neon_qsub<q>")] ) +(define_expand "vec_cmp<mode><v_cmp_result>" + [(set (match_operand:<V_cmp_result> 0 "s_register_operand") + (match_operator:<V_cmp_result> 1 "comparison_operator" + [(match_operand:VDQW 2 "s_register_operand") + (match_operand:VDQW 3 "reg_or_zero_operand")]))] + "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu<mode><mode>" + [(set (match_operand:VDQIW 0 "s_register_operand") + (match_operator:VDQIW 1 "comparison_operator" + [(match_operand:VDQIW 2 "s_register_operand") + (match_operand:VDQIW 3 "reg_or_zero_operand")]))] + "TARGET_NEON" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + ;; Conditional instructions. These are comparisons with conditional moves for ;; vectors. They perform the assignment: ;; @@ -1586,230 +1531,53 @@ (if_then_else:VDQW (match_operator 3 "comparison_operator" [(match_operand:VDQW 4 "s_register_operand") - (match_operand:VDQW 5 "nonmemory_operand")]) + (match_operand:VDQW 5 "reg_or_zero_operand")]) (match_operand:VDQW 1 "s_register_operand") (match_operand:VDQW 2 "s_register_operand")))] "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" { - int inverse = 0; - int use_zero_form = 0; - int swap_bsl_operands = 0; - rtx mask = gen_reg_rtx (<V_cmp_result>mode); - rtx tmp = gen_reg_rtx (<V_cmp_result>mode); - - rtx (*base_comparison) (rtx, rtx, rtx); - rtx (*complimentary_comparison) (rtx, rtx, rtx); - - switch (GET_CODE (operands[3])) - { - case GE: - case GT: - case LE: - case LT: - case EQ: - if (operands[5] == CONST0_RTX (<MODE>mode)) - { - use_zero_form = 1; - break; - } - /* Fall through. */ - default: - if (!REG_P (operands[5])) - operands[5] = force_reg (<MODE>mode, operands[5]); - } - - switch (GET_CODE (operands[3])) - { - case LT: - case UNLT: - inverse = 1; - /* Fall through. */ - case GE: - case UNGE: - case ORDERED: - case UNORDERED: - base_comparison = gen_neon_vcge<mode>; - complimentary_comparison = gen_neon_vcgt<mode>; - break; - case LE: - case UNLE: - inverse = 1; - /* Fall through. 
*/ - case GT: - case UNGT: - base_comparison = gen_neon_vcgt<mode>; - complimentary_comparison = gen_neon_vcge<mode>; - break; - case EQ: - case NE: - case UNEQ: - base_comparison = gen_neon_vceq<mode>; - complimentary_comparison = gen_neon_vceq<mode>; - break; - default: - gcc_unreachable (); - } - - switch (GET_CODE (operands[3])) - { - case LT: - case LE: - case GT: - case GE: - case EQ: - /* The easy case. Here we emit one of vcge, vcgt or vceq. - As a LT b <=> b GE a && a LE b <=> b GT a. Our transformations are: - a GE b -> a GE b - a GT b -> a GT b - a LE b -> b GE a - a LT b -> b GT a - a EQ b -> a EQ b - Note that there also exist direct comparison against 0 forms, - so catch those as a special case. */ - if (use_zero_form) - { - inverse = 0; - switch (GET_CODE (operands[3])) - { - case LT: - base_comparison = gen_neon_vclt<mode>; - break; - case LE: - base_comparison = gen_neon_vcle<mode>; - break; - default: - /* Do nothing, other zero form cases already have the correct - base_comparison. */ - break; - } - } - - if (!inverse) - emit_insn (base_comparison (mask, operands[4], operands[5])); - else - emit_insn (complimentary_comparison (mask, operands[5], operands[4])); - break; - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case NE: - /* Vector compare returns false for lanes which are unordered, so if we use - the inverse of the comparison we actually want to emit, then - swap the operands to BSL, we will end up with the correct result. - Note that a NE NaN and NaN NE b are true for all a, b. - - Our transformations are: - a GE b -> !(b GT a) - a GT b -> !(b GE a) - a LE b -> !(a GT b) - a LT b -> !(a GE b) - a NE b -> !(a EQ b) */ - - if (inverse) - emit_insn (base_comparison (mask, operands[4], operands[5])); - else - emit_insn (complimentary_comparison (mask, operands[5], operands[4])); - - swap_bsl_operands = 1; - break; - case UNEQ: - /* We check (a > b || b > a). combining these comparisons give us - true iff !(a != b && a ORDERED b), swapping the operands to BSL - will then give us (a == b || a UNORDERED b) as intended. */ - - emit_insn (gen_neon_vcgt<mode> (mask, operands[4], operands[5])); - emit_insn (gen_neon_vcgt<mode> (tmp, operands[5], operands[4])); - emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp)); - swap_bsl_operands = 1; - break; - case UNORDERED: - /* Operands are ORDERED iff (a > b || b >= a). - Swapping the operands to BSL will give the UNORDERED case. */ - swap_bsl_operands = 1; - /* Fall through. 
*/ - case ORDERED: - emit_insn (gen_neon_vcgt<mode> (tmp, operands[4], operands[5])); - emit_insn (gen_neon_vcge<mode> (mask, operands[5], operands[4])); - emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp)); - break; - default: - gcc_unreachable (); - } + arm_expand_vcond (operands, <V_cmp_result>mode); + DONE; +}) - if (swap_bsl_operands) - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[1], - operands[2])); +(define_expand "vcond<V_cvtto><mode>" + [(set (match_operand:<V_CVTTO> 0 "s_register_operand") + (if_then_else:<V_CVTTO> + (match_operator 3 "comparison_operator" + [(match_operand:V32 4 "s_register_operand") + (match_operand:V32 5 "reg_or_zero_operand")]) + (match_operand:<V_CVTTO> 1 "s_register_operand") + (match_operand:<V_CVTTO> 2 "s_register_operand")))] + "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" +{ + arm_expand_vcond (operands, <V_cmp_result>mode); DONE; }) -(define_expand "vcondu<mode><mode>" - [(set (match_operand:VDQIW 0 "s_register_operand") - (if_then_else:VDQIW +(define_expand "vcondu<mode><v_cmp_result>" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW (match_operator 3 "arm_comparison_operator" - [(match_operand:VDQIW 4 "s_register_operand") - (match_operand:VDQIW 5 "s_register_operand")]) - (match_operand:VDQIW 1 "s_register_operand") - (match_operand:VDQIW 2 "s_register_operand")))] + [(match_operand:<V_cmp_result> 4 "s_register_operand") + (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")]) + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] "TARGET_NEON" { - rtx mask; - int inverse = 0, immediate_zero = 0; - - mask = gen_reg_rtx (<V_cmp_result>mode); - - if (operands[5] == CONST0_RTX (<MODE>mode)) - immediate_zero = 1; - else if (!REG_P (operands[5])) - operands[5] = force_reg (<MODE>mode, operands[5]); - - switch (GET_CODE (operands[3])) - { - case GEU: - emit_insn (gen_neon_vcgeu<mode> (mask, operands[4], operands[5])); - break; - - case GTU: - emit_insn (gen_neon_vcgtu<mode> (mask, operands[4], operands[5])); - break; - - case EQ: - emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5])); - break; - - case LEU: - if (immediate_zero) - emit_insn (gen_neon_vcle<mode> (mask, operands[4], operands[5])); - else - emit_insn (gen_neon_vcgeu<mode> (mask, operands[5], operands[4])); - break; - - case LTU: - if (immediate_zero) - emit_insn (gen_neon_vclt<mode> (mask, operands[4], operands[5])); - else - emit_insn (gen_neon_vcgtu<mode> (mask, operands[5], operands[4])); - break; - - case NE: - emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5])); - inverse = 1; - break; - - default: - gcc_unreachable (); - } - - if (inverse) - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[1], - operands[2])); + arm_expand_vcond (operands, <V_cmp_result>mode); + DONE; +}) +(define_expand "vcond_mask_<mode><v_cmp_result>" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW + (match_operand:<V_cmp_result> 3 "s_register_operand") + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] + "TARGET_NEON" +{ + emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1], + operands[2])); DONE; }) @@ -1823,7 +1591,7 @@ (match_operand:VCVTF 2 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || 
flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[2])); else emit_insn (gen_neon_vadd<mode>_unspec (operands[0], operands[1], @@ -1837,7 +1605,7 @@ (match_operand:VH 2 "s_register_operand")] "TARGET_NEON_FP16INST" { - emit_insn (gen_add<mode>3_fp16 (operands[0], operands[1], operands[2])); + emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[2])); DONE; }) @@ -1847,7 +1615,7 @@ (match_operand:VH 2 "s_register_operand")] "TARGET_NEON_FP16INST" { - emit_insn (gen_sub<mode>3_fp16 (operands[0], operands[1], operands[2])); + emit_insn (gen_sub<mode>3 (operands[0], operands[1], operands[2])); DONE; }) @@ -1942,17 +1710,6 @@ (const_string "neon_mul_<V_elem_ch><q>")))] ) -(define_insn "mul<mode>3" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (mult:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_mul_<VH_elem_ch><q>")] -) - (define_insn "neon_vmulf<mode>" [(set (match_operand:VH 0 "s_register_operand" "=w") @@ -1971,7 +1728,7 @@ (match_operand:VDQW 3 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_mul<mode>3add<mode>_neon (operands[0], operands[1], operands[2], operands[3])); else @@ -1999,8 +1756,8 @@ (match_operand:VH 3 "s_register_operand")] "TARGET_NEON_FP16INST" { - emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3], - operands[1])); + emit_insn (gen_fma<mode>4 (operands[0], operands[2], operands[3], + operands[1])); DONE; }) @@ -2462,7 +2219,7 @@ (match_operand:VDQW 3 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_mul<mode>3neg<mode>add<mode>_neon (operands[0], operands[1], operands[2], operands[3])); else @@ -2569,7 +2326,7 @@ (match_operand:VCVTF 2 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_sub<mode>3 (operands[0], operands[1], operands[2])); else emit_insn (gen_neon_vsub<mode>_unspec (operands[0], operands[1], @@ -2644,7 +2401,7 @@ ;; These may expand to an UNSPEC pattern when a floating point mode is used ;; without unsafe math optimizations. 
-(define_expand "neon_vc<cmp_op><mode>" +(define_expand "@neon_vc<cmp_op><mode>" [(match_operand:<V_cmp_result> 0 "s_register_operand") (neg:<V_cmp_result> (COMPARISONS:VDQW (match_operand:VDQW 1 "s_register_operand") @@ -2684,7 +2441,7 @@ } ) -(define_insn "neon_vc<cmp_op><mode>_insn" +(define_insn "@neon_vc<cmp_op><mode>_insn" [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w") (neg:<V_cmp_result> (COMPARISONS:<V_cmp_result> @@ -2728,7 +2485,7 @@ [(set_attr "type" "neon_fp_compare_s<q>")] ) -(define_expand "neon_vc<cmp_op><mode>" +(define_expand "@neon_vc<cmp_op><mode>" [(match_operand:<V_cmp_result> 0 "s_register_operand") (neg:<V_cmp_result> (COMPARISONS:VH @@ -2794,7 +2551,7 @@ } [(set_attr "type" "neon_fp_compare_s<q>")]) -(define_insn "neon_vc<cmp_op>u<mode>" +(define_insn "@neon_vc<code><mode>" [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w") (neg:<V_cmp_result> (GTUGEU:<V_cmp_result> @@ -4751,7 +4508,7 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_bsl<q>")] ) -(define_expand "neon_vbsl<mode>" +(define_expand "@neon_vbsl<mode>" [(set (match_operand:VDQX 0 "s_register_operand") (unspec:VDQX [(match_operand:<V_cmp_result> 1 "s_register_operand") (match_operand:VDQX 2 "s_register_operand") @@ -6658,7 +6415,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:VF 0 "s_register_operand" "=w") (abs:VF (minus:VF (match_operand:VF 1 "s_register_operand" "w") (match_operand:VF 2 "s_register_operand" "w"))))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) @@ -6668,7 +6425,7 @@ if (BYTES_BIG_ENDIAN) (abs:VF (unspec:VF [(match_operand:VF 1 "s_register_operand" "w") (match_operand:VF 2 "s_register_operand" "w")] UNSPEC_VSUB)))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) diff --git a/gcc/config/arm/parsecpu.awk b/gcc/config/arm/parsecpu.awk index 7fc3754..9423e8a 100644 --- a/gcc/config/arm/parsecpu.awk +++ b/gcc/config/arm/parsecpu.awk @@ -190,6 +190,23 @@ function gen_isa () { ORS = z print "\n" } + + print "struct fbit_implication {" + print " /* Represents a feature implication, where:" + print " ante IMPLIES cons" + print " meaning that if ante is enabled then we should" + print " also implicitly enable cons. */" + print " enum isa_feature ante;" + print " enum isa_feature cons;" + print "};\n" + print "static const struct fbit_implication all_implied_fbits[] =" + print "{" + for (impl in implied_bits) { + split (impl, impl_parts, SUBSEP) + print " { isa_bit_" impl_parts[2] ", isa_bit_" impl_parts[1] " }," + } + print " { isa_nobit, isa_nobit }" + print "};\n" } function gen_data () { @@ -600,6 +617,40 @@ BEGIN { parse_ok = 1 } +/^define implied / { + if (NF < 4) fatal("syntax: define implied <name> [<feature-or-fgroup>]+\n" \ + "Implied bits must be defined with at least one antecedent.") + toplevel() + fbit = $3 + if (fbit in features) fatal("implied feature " fbit " aliases a real feature") + if (fbit in fgroup) fatal("implied feature " fbit " aliases a feature group") + fcount = NF + features[fbit] = 1 + for (n = 4; n <= fcount; n++) { + ante = $n + if (fbit == ante) fatal("feature cannot imply itself") + else if (ante in features) { + for (impl in implied_bits) { + split(impl, impl_sep, SUBSEP) + if (ante == impl_sep[1]) + fatal(ante " implies implied bit " fbit \ + ". 
Chained implications not currently supported") + } + implied_bits[fbit, ante] = 1 + } else if (ante in fgroup) { + for (bitcomb in fgrp_bits) { + split(bitcomb, bitsep, SUBSEP) + if (bitsep[1] == ante) { + implied_bits[fbit, bitsep[2]] = 1 + } + } + } else { + fatal("implied bit antecedent " ante " unrecognized") + } + } + parse_ok = 1 +} + /^begin fpu / { if (NF != 3) fatal("syntax: begin fpu <name>") toplevel() diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 0a2399d..a3844e9 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -519,3 +519,803 @@ UNSPEC_BFMAB UNSPEC_BFMAT ]) + +;; Enumerators for MVE unspecs. +(define_c_enum "unspec" [ + VST4Q + VRNDXQ_F + VRNDQ_F + VRNDPQ_F + VRNDNQ_F + VRNDMQ_F + VRNDAQ_F + VREV64Q_F + VNEGQ_F + VDUPQ_N_F + VABSQ_F + VREV32Q_F + VCVTTQ_F32_F16 + VCVTBQ_F32_F16 + VCVTQ_TO_F_S + VQNEGQ_S + VCVTQ_TO_F_U + VREV16Q_S + VREV16Q_U + VADDLVQ_S + VMVNQ_N_S + VMVNQ_N_U + VCVTAQ_S + VCVTAQ_U + VREV64Q_S + VREV64Q_U + VQABSQ_S + VNEGQ_S + VMVNQ_S + VMVNQ_U + VDUPQ_N_U + VDUPQ_N_S + VCLZQ_U + VCLZQ_S + VCLSQ_S + VADDVQ_S + VADDVQ_U + VABSQ_S + VREV32Q_U + VREV32Q_S + VMOVLTQ_U + VMOVLTQ_S + VMOVLBQ_S + VMOVLBQ_U + VCVTQ_FROM_F_S + VCVTQ_FROM_F_U + VCVTPQ_S + VCVTPQ_U + VCVTNQ_S + VCVTNQ_U + VCVTMQ_S + VCVTMQ_U + VADDLVQ_U + VCTP8Q + VCTP16Q + VCTP32Q + VCTP64Q + VPNOT + VCREATEQ_F + VCVTQ_N_TO_F_S + VCVTQ_N_TO_F_U + VBRSRQ_N_F + VSUBQ_N_F + VCREATEQ_U + VCREATEQ_S + VSHRQ_N_S + VSHRQ_N_U + VCVTQ_N_FROM_F_S + VCVTQ_N_FROM_F_U + VADDLVQ_P_S + VADDLVQ_P_U + VCMPNEQ_U + VCMPNEQ_S + VSHLQ_S + VSHLQ_U + VABDQ_S + VADDQ_N_S + VADDVAQ_S + VADDVQ_P_S + VANDQ_S + VBICQ_S + VBRSRQ_N_S + VCADDQ_ROT270_S + VCADDQ_ROT90_S + VCMPEQQ_S + VCMPEQQ_N_S + VCMPNEQ_N_S + VEORQ_S + VHADDQ_S + VHADDQ_N_S + VHSUBQ_S + VHSUBQ_N_S + VMAXQ_S + VMAXVQ_S + VMINQ_S + VMINVQ_S + VMLADAVQ_S + VMULHQ_S + VMULLBQ_INT_S + VMULLTQ_INT_S + VMULQ_S + VMULQ_N_S + VORNQ_S + VORRQ_S + VQADDQ_S + VQADDQ_N_S + VQRSHLQ_S + VQRSHLQ_N_S + VQSHLQ_S + VQSHLQ_N_S + VQSHLQ_R_S + VQSUBQ_S + VQSUBQ_N_S + VRHADDQ_S + VRMULHQ_S + VRSHLQ_S + VRSHLQ_N_S + VRSHRQ_N_S + VSHLQ_N_S + VSHLQ_R_S + VSUBQ_S + VSUBQ_N_S + VABDQ_U + VADDQ_N_U + VADDVAQ_U + VADDVQ_P_U + VANDQ_U + VBICQ_U + VBRSRQ_N_U + VCADDQ_ROT270_U + VCADDQ_ROT90_U + VCMPEQQ_U + VCMPEQQ_N_U + VCMPNEQ_N_U + VEORQ_U + VHADDQ_U + VHADDQ_N_U + VHSUBQ_U + VHSUBQ_N_U + VMAXQ_U + VMAXVQ_U + VMINQ_U + VMINVQ_U + VMLADAVQ_U + VMULHQ_U + VMULLBQ_INT_U + VMULLTQ_INT_U + VMULQ_U + VMULQ_N_U + VORNQ_U + VORRQ_U + VQADDQ_U + VQADDQ_N_U + VQRSHLQ_U + VQRSHLQ_N_U + VQSHLQ_U + VQSHLQ_N_U + VQSHLQ_R_U + VQSUBQ_U + VQSUBQ_N_U + VRHADDQ_U + VRMULHQ_U + VRSHLQ_U + VRSHLQ_N_U + VRSHRQ_N_U + VSHLQ_N_U + VSHLQ_R_U + VSUBQ_U + VSUBQ_N_U + VCMPGEQ_N_S + VCMPGEQ_S + VCMPGTQ_N_S + VCMPGTQ_S + VCMPLEQ_N_S + VCMPLEQ_S + VCMPLTQ_N_S + VCMPLTQ_S + VHCADDQ_ROT270_S + VHCADDQ_ROT90_S + VMAXAQ_S + VMAXAVQ_S + VMINAQ_S + VMINAVQ_S + VMLADAVXQ_S + VMLSDAVQ_S + VMLSDAVXQ_S + VQDMULHQ_N_S + VQDMULHQ_S + VQRDMULHQ_N_S + VQRDMULHQ_S + VQSHLUQ_N_S + VCMPCSQ_N_U + VCMPCSQ_U + VCMPHIQ_N_U + VCMPHIQ_U + VABDQ_M_S + VABDQ_M_U + VABDQ_F + VADDQ_N_F + VANDQ_F + VBICQ_F + VCADDQ_ROT270_F + VCADDQ_ROT90_F + VCMPEQQ_F + VCMPEQQ_N_F + VCMPGEQ_F + VCMPGEQ_N_F + VCMPGTQ_F + VCMPGTQ_N_F + VCMPLEQ_F + VCMPLEQ_N_F + VCMPLTQ_F + VCMPLTQ_N_F + VCMPNEQ_F + VCMPNEQ_N_F + VCMULQ_F + VCMULQ_ROT180_F + VCMULQ_ROT270_F + VCMULQ_ROT90_F + VEORQ_F + VMAXNMAQ_F + VMAXNMAVQ_F + VMAXNMQ_F + VMAXNMVQ_F + VMINNMAQ_F + VMINNMAVQ_F + VMINNMQ_F + VMINNMVQ_F + VMULQ_F + VMULQ_N_F + VORNQ_F + 
VORRQ_F + VSUBQ_F + VADDLVAQ_U + VADDLVAQ_S + VBICQ_N_U + VBICQ_N_S + VCTP8Q_M + VCTP16Q_M + VCTP32Q_M + VCTP64Q_M + VCVTBQ_F16_F32 + VCVTTQ_F16_F32 + VMLALDAVQ_U + VMLALDAVXQ_U + VMLALDAVXQ_S + VMLALDAVQ_S + VMLSLDAVQ_S + VMLSLDAVXQ_S + VMOVNBQ_U + VMOVNBQ_S + VMOVNTQ_U + VMOVNTQ_S + VORRQ_N_S + VORRQ_N_U + VQDMULLBQ_N_S + VQDMULLBQ_S + VQDMULLTQ_N_S + VQDMULLTQ_S + VQMOVNBQ_U + VQMOVNBQ_S + VQMOVUNBQ_S + VQMOVUNTQ_S + VRMLALDAVHXQ_S + VRMLSLDAVHQ_S + VRMLSLDAVHXQ_S + VSHLLBQ_S + VSHLLBQ_U + VSHLLTQ_U + VSHLLTQ_S + VQMOVNTQ_U + VQMOVNTQ_S + VSHLLBQ_N_S + VSHLLBQ_N_U + VSHLLTQ_N_U + VSHLLTQ_N_S + VRMLALDAVHQ_U + VRMLALDAVHQ_S + VMULLTQ_POLY_P + VMULLBQ_POLY_P + VBICQ_M_N_S + VBICQ_M_N_U + VCMPEQQ_M_F + VCVTAQ_M_S + VCVTAQ_M_U + VCVTQ_M_TO_F_S + VCVTQ_M_TO_F_U + VQRSHRNBQ_N_U + VQRSHRNBQ_N_S + VQRSHRUNBQ_N_S + VRMLALDAVHAQ_S + VABAVQ_S + VABAVQ_U + VSHLCQ_S + VSHLCQ_U + VRMLALDAVHAQ_U + VABSQ_M_S + VADDVAQ_P_S + VADDVAQ_P_U + VCLSQ_M_S + VCLZQ_M_S + VCLZQ_M_U + VCMPCSQ_M_N_U + VCMPCSQ_M_U + VCMPEQQ_M_N_S + VCMPEQQ_M_N_U + VCMPEQQ_M_S + VCMPEQQ_M_U + VCMPGEQ_M_N_S + VCMPGEQ_M_S + VCMPGTQ_M_N_S + VCMPGTQ_M_S + VCMPHIQ_M_N_U + VCMPHIQ_M_U + VCMPLEQ_M_N_S + VCMPLEQ_M_S + VCMPLTQ_M_N_S + VCMPLTQ_M_S + VCMPNEQ_M_N_S + VCMPNEQ_M_N_U + VCMPNEQ_M_S + VCMPNEQ_M_U + VDUPQ_M_N_S + VDUPQ_M_N_U + VDWDUPQ_N_U + VDWDUPQ_WB_U + VIWDUPQ_N_U + VIWDUPQ_WB_U + VMAXAQ_M_S + VMAXAVQ_P_S + VMAXVQ_P_S + VMAXVQ_P_U + VMINAQ_M_S + VMINAVQ_P_S + VMINVQ_P_S + VMINVQ_P_U + VMLADAVAQ_S + VMLADAVAQ_U + VMLADAVQ_P_S + VMLADAVQ_P_U + VMLADAVXQ_P_S + VMLAQ_N_S + VMLAQ_N_U + VMLASQ_N_S + VMLASQ_N_U + VMLSDAVQ_P_S + VMLSDAVXQ_P_S + VMVNQ_M_S + VMVNQ_M_U + VNEGQ_M_S + VPSELQ_S + VPSELQ_U + VQABSQ_M_S + VQDMLAHQ_N_S + VQDMLASHQ_N_S + VQNEGQ_M_S + VQRDMLADHQ_S + VQRDMLADHXQ_S + VQRDMLAHQ_N_S + VQRDMLASHQ_N_S + VQRDMLSDHQ_S + VQRDMLSDHXQ_S + VQRSHLQ_M_N_S + VQRSHLQ_M_N_U + VQSHLQ_M_R_S + VQSHLQ_M_R_U + VREV64Q_M_S + VREV64Q_M_U + VRSHLQ_M_N_S + VRSHLQ_M_N_U + VSHLQ_M_R_S + VSHLQ_M_R_U + VSLIQ_N_S + VSLIQ_N_U + VSRIQ_N_S + VSRIQ_N_U + VQDMLSDHXQ_S + VQDMLSDHQ_S + VQDMLADHXQ_S + VQDMLADHQ_S + VMLSDAVAXQ_S + VMLSDAVAQ_S + VMLADAVAXQ_S + VCMPGEQ_M_F + VCMPGTQ_M_N_F + VMLSLDAVQ_P_S + VRMLALDAVHAXQ_S + VMLSLDAVXQ_P_S + VFMAQ_F + VMLSLDAVAQ_S + VQSHRUNBQ_N_S + VQRSHRUNTQ_N_S + VCMLAQ_F + VMINNMAQ_M_F + VFMASQ_N_F + VDUPQ_M_N_F + VCMPGTQ_M_F + VCMPLTQ_M_F + VRMLSLDAVHQ_P_S + VQSHRUNTQ_N_S + VABSQ_M_F + VMAXNMAVQ_P_F + VFMAQ_N_F + VRMLSLDAVHXQ_P_S + VREV32Q_M_F + VRMLSLDAVHAQ_S + VRMLSLDAVHAXQ_S + VCMPLTQ_M_N_F + VCMPNEQ_M_F + VRNDAQ_M_F + VRNDPQ_M_F + VADDLVAQ_P_S + VQMOVUNBQ_M_S + VCMPLEQ_M_F + VCMLAQ_ROT180_F + VMLSLDAVAXQ_S + VRNDXQ_M_F + VFMSQ_F + VMINNMVQ_P_F + VMAXNMVQ_P_F + VPSELQ_F + VCMLAQ_ROT90_F + VQMOVUNTQ_M_S + VREV64Q_M_F + VNEGQ_M_F + VRNDMQ_M_F + VCMPLEQ_M_N_F + VCMPGEQ_M_N_F + VRNDNQ_M_F + VMINNMAVQ_P_F + VCMPNEQ_M_N_F + VRMLALDAVHQ_P_S + VRMLALDAVHXQ_P_S + VCMPEQQ_M_N_F + VCMLAQ_ROT270_F + VMAXNMAQ_M_F + VRNDQ_M_F + VMLALDAVQ_P_U + VMLALDAVQ_P_S + VQMOVNBQ_M_S + VQMOVNBQ_M_U + VMOVLTQ_M_U + VMOVLTQ_M_S + VMOVNBQ_M_U + VMOVNBQ_M_S + VRSHRNTQ_N_U + VRSHRNTQ_N_S + VORRQ_M_N_S + VORRQ_M_N_U + VREV32Q_M_S + VREV32Q_M_U + VQRSHRNTQ_N_U + VQRSHRNTQ_N_S + VMOVNTQ_M_U + VMOVNTQ_M_S + VMOVLBQ_M_U + VMOVLBQ_M_S + VMLALDAVAQ_S + VMLALDAVAQ_U + VQSHRNBQ_N_U + VQSHRNBQ_N_S + VSHRNBQ_N_U + VSHRNBQ_N_S + VRSHRNBQ_N_S + VRSHRNBQ_N_U + VMLALDAVXQ_P_U + VMLALDAVXQ_P_S + VQMOVNTQ_M_U + VQMOVNTQ_M_S + VMVNQ_M_N_U + VMVNQ_M_N_S + VQSHRNTQ_N_U + VQSHRNTQ_N_S + VMLALDAVAXQ_S + VMLALDAVAXQ_U + VSHRNTQ_N_S + VSHRNTQ_N_U + VCVTBQ_M_F16_F32 + 
VCVTBQ_M_F32_F16 + VCVTTQ_M_F16_F32 + VCVTTQ_M_F32_F16 + VCVTMQ_M_S + VCVTMQ_M_U + VCVTNQ_M_S + VCVTPQ_M_S + VCVTPQ_M_U + VCVTQ_M_N_FROM_F_S + VCVTNQ_M_U + VREV16Q_M_S + VREV16Q_M_U + VREV32Q_M + VCVTQ_M_FROM_F_U + VCVTQ_M_FROM_F_S + VRMLALDAVHQ_P_U + VADDLVAQ_P_U + VCVTQ_M_N_FROM_F_U + VQSHLUQ_M_N_S + VABAVQ_P_S + VABAVQ_P_U + VSHLQ_M_S + VSHLQ_M_U + VSRIQ_M_N_S + VSRIQ_M_N_U + VSUBQ_M_U + VSUBQ_M_S + VCVTQ_M_N_TO_F_U + VCVTQ_M_N_TO_F_S + VQADDQ_M_U + VQADDQ_M_S + VRSHRQ_M_N_S + VSUBQ_M_N_S + VSUBQ_M_N_U + VBRSRQ_M_N_S + VSUBQ_M_N_F + VBICQ_M_F + VHADDQ_M_U + VBICQ_M_U + VBICQ_M_S + VMULQ_M_N_U + VHADDQ_M_S + VORNQ_M_F + VMLAQ_M_N_S + VQSUBQ_M_U + VQSUBQ_M_S + VMLAQ_M_N_U + VQSUBQ_M_N_U + VQSUBQ_M_N_S + VMULLTQ_INT_M_S + VMULLTQ_INT_M_U + VMULQ_M_N_S + VMULQ_M_N_F + VMLASQ_M_N_U + VMLASQ_M_N_S + VMAXQ_M_U + VQRDMLAHQ_M_N_U + VCADDQ_ROT270_M_F + VCADDQ_ROT270_M_U + VCADDQ_ROT270_M_S + VQRSHLQ_M_S + VMULQ_M_F + VRHADDQ_M_U + VSHRQ_M_N_U + VRHADDQ_M_S + VMULQ_M_S + VMULQ_M_U + VQDMLASHQ_M_N_S + VQRDMLASHQ_M_N_S + VRSHLQ_M_S + VRSHLQ_M_U + VRSHRQ_M_N_U + VADDQ_M_N_F + VADDQ_M_N_S + VADDQ_M_N_U + VQRDMLASHQ_M_N_U + VMAXQ_M_S + VQRDMLAHQ_M_N_S + VORRQ_M_S + VORRQ_M_U + VORRQ_M_F + VQRSHLQ_M_U + VRMULHQ_M_U + VRMULHQ_M_S + VMINQ_M_S + VMINQ_M_U + VANDQ_M_F + VANDQ_M_U + VANDQ_M_S + VHSUBQ_M_N_S + VHSUBQ_M_N_U + VMULHQ_M_S + VMULHQ_M_U + VMULLBQ_INT_M_U + VMULLBQ_INT_M_S + VCADDQ_ROT90_M_F + VSHRQ_M_N_S + VADDQ_M_U + VSLIQ_M_N_U + VQADDQ_M_N_S + VBRSRQ_M_N_F + VABDQ_M_F + VBRSRQ_M_N_U + VEORQ_M_F + VSHLQ_M_N_S + VQDMLAHQ_M_N_U + VQDMLAHQ_M_N_S + VSHLQ_M_N_U + VMLADAVAQ_P_U + VMLADAVAQ_P_S + VSLIQ_M_N_S + VQSHLQ_M_U + VQSHLQ_M_S + VCADDQ_ROT90_M_U + VCADDQ_ROT90_M_S + VORNQ_M_U + VORNQ_M_S + VQSHLQ_M_N_S + VQSHLQ_M_N_U + VADDQ_M_S + VHADDQ_M_N_S + VADDQ_M_F + VQADDQ_M_N_U + VEORQ_M_S + VEORQ_M_U + VHSUBQ_M_S + VHSUBQ_M_U + VHADDQ_M_N_U + VHCADDQ_ROT90_M_S + VQRDMLSDHQ_M_S + VQRDMLSDHXQ_M_S + VQRDMLADHXQ_M_S + VQDMULHQ_M_S + VMLADAVAXQ_P_S + VQDMLADHXQ_M_S + VQRDMULHQ_M_S + VMLSDAVAXQ_P_S + VQDMULHQ_M_N_S + VHCADDQ_ROT270_M_S + VQDMLSDHQ_M_S + VQDMLSDHXQ_M_S + VMLSDAVAQ_P_S + VQRDMLADHQ_M_S + VQDMLADHQ_M_S + VMLALDAVAQ_P_U + VMLALDAVAQ_P_S + VQRSHRNBQ_M_N_U + VQRSHRNBQ_M_N_S + VQRSHRNTQ_M_N_S + VQSHRNBQ_M_N_U + VQSHRNBQ_M_N_S + VQSHRNTQ_M_N_S + VRSHRNBQ_M_N_U + VRSHRNBQ_M_N_S + VRSHRNTQ_M_N_U + VSHLLBQ_M_N_U + VSHLLBQ_M_N_S + VSHLLTQ_M_N_U + VSHLLTQ_M_N_S + VSHRNBQ_M_N_S + VSHRNBQ_M_N_U + VSHRNTQ_M_N_S + VSHRNTQ_M_N_U + VMLALDAVAXQ_P_S + VQRSHRNTQ_M_N_U + VQSHRNTQ_M_N_U + VRSHRNTQ_M_N_S + VQRDMULHQ_M_N_S + VRMLALDAVHAQ_P_S + VMLSLDAVAQ_P_S + VMLSLDAVAXQ_P_S + VMULLBQ_POLY_M_P + VMULLTQ_POLY_M_P + VQDMULLBQ_M_N_S + VQDMULLBQ_M_S + VQDMULLTQ_M_N_S + VQDMULLTQ_M_S + VQRSHRUNBQ_M_N_S + VQSHRUNBQ_M_N_S + VQSHRUNTQ_M_N_S + VRMLALDAVHAQ_P_U + VRMLALDAVHAXQ_P_S + VRMLSLDAVHAQ_P_S + VRMLSLDAVHAXQ_P_S + VQRSHRUNTQ_M_N_S + VCMLAQ_M_F + VCMLAQ_ROT180_M_F + VCMLAQ_ROT270_M_F + VCMLAQ_ROT90_M_F + VCMULQ_M_F + VCMULQ_ROT180_M_F + VCMULQ_ROT270_M_F + VCMULQ_ROT90_M_F + VFMAQ_M_F + VFMAQ_M_N_F + VFMASQ_M_N_F + VFMSQ_M_F + VMAXNMQ_M_F + VMINNMQ_M_F + VSUBQ_M_F + VSTRWQSB_S + VSTRWQSB_U + VSTRBQSO_S + VSTRBQSO_U + VSTRBQ_S + VSTRBQ_U + VLDRBQGO_S + VLDRBQGO_U + VLDRBQ_S + VLDRBQ_U + VLDRWQGB_S + VLDRWQGB_U + VLD1Q_F + VLD1Q_S + VLD1Q_U + VLDRHQ_F + VLDRHQGO_S + VLDRHQGO_U + VLDRHQGSO_S + VLDRHQGSO_U + VLDRHQ_S + VLDRHQ_U + VLDRWQ_F + VLDRWQ_S + VLDRWQ_U + VLDRDQGB_S + VLDRDQGB_U + VLDRDQGO_S + VLDRDQGO_U + VLDRDQGSO_S + VLDRDQGSO_U + VLDRHQGO_F + VLDRHQGSO_F + VLDRWQGB_F + VLDRWQGO_F + VLDRWQGO_S + VLDRWQGO_U + 
VLDRWQGSO_F + VLDRWQGSO_S + VLDRWQGSO_U + VSTRHQ_F + VST1Q_S + VST1Q_U + VSTRHQSO_S + VSTRHQ_U + VSTRWQ_S + VSTRWQ_U + VSTRWQ_F + VST1Q_F + VSTRDQSB_S + VSTRDQSB_U + VSTRDQSO_S + VSTRDQSO_U + VSTRDQSSO_S + VSTRDQSSO_U + VSTRWQSO_S + VSTRWQSO_U + VSTRWQSSO_S + VSTRWQSSO_U + VSTRHQSO_F + VSTRHQSSO_F + VSTRWQSB_F + VSTRWQSO_F + VSTRWQSSO_F + VDDUPQ + VDDUPQ_M + VDWDUPQ + VDWDUPQ_M + VIDUPQ + VIDUPQ_M + VIWDUPQ + VIWDUPQ_M + VSTRWQSBWB_S + VSTRWQSBWB_U + VLDRWQGBWB_S + VLDRWQGBWB_U + VSTRWQSBWB_F + VLDRWQGBWB_F + VSTRDQSBWB_S + VSTRDQSBWB_U + VLDRDQGBWB_S + VLDRDQGBWB_U + VADCQ_U + VADCQ_M_U + VADCQ_S + VADCQ_M_S + VSBCIQ_U + VSBCIQ_S + VSBCIQ_M_U + VSBCIQ_M_S + VSBCQ_U + VSBCQ_S + VSBCQ_M_U + VSBCQ_M_S + VADCIQ_U + VADCIQ_M_U + VADCIQ_S + VADCIQ_M_S + VLD2Q + VLD4Q + VST2Q + VSHLCQ_M_U + VSHLCQ_M_S + VSTRHQSO_U + VSTRHQSSO_S + VSTRHQSSO_U + VSTRHQ_S + SRSHRL + SRSHR + URSHR + URSHRL + SQRSHR + UQRSHL + UQRSHLL_64 + UQRSHLL_48 + SQRSHRL_64 + SQRSHRL_48 + VSHLCQ_M_ +]) diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index b7e3619..250e503 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -81,104 +81,53 @@ ;; patterns separately for Neon, IWMMXT and MVE. (define_expand "add<mode>3" - [(set (match_operand:VNIM 0 "s_register_operand") - (plus:VNIM (match_operand:VNIM 1 "s_register_operand") - (match_operand:VNIM 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode)) - || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE(<MODE>mode)) - || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE(<MODE>mode))" -{ -}) - -;; Vector arithmetic. Expanders are blank, then unnamed insns implement -;; patterns separately for Neon and MVE. - -(define_expand "addv8hf3" - [(set (match_operand:V8HF 0 "s_register_operand") - (plus:V8HF (match_operand:V8HF 1 "s_register_operand") - (match_operand:V8HF 2 "s_register_operand")))] - "(TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE(V8HFmode)) - || (TARGET_NEON_FP16INST && flag_unsafe_math_optimizations)" -{ - if (TARGET_NEON_FP16INST && flag_unsafe_math_optimizations) - emit_insn (gen_addv8hf3_neon (operands[0], operands[1], operands[2])); -}) - -;; Vector arithmetic. Expanders are blank, then unnamed insns implement -;; patterns separately for Neon and IWMMXT. - -(define_expand "add<mode>3" - [(set (match_operand:VNINOTM 0 "s_register_operand") - (plus:VNINOTM (match_operand:VNINOTM 1 "s_register_operand") - (match_operand:VNINOTM 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) - -;; Vector arithmetic. Expanders are blank, then unnamed insns implement -;; patterns separately for IWMMXT and Neon. 
+ [(set (match_operand:VDQ 0 "s_register_operand") + (plus:VDQ (match_operand:VDQ 1 "s_register_operand") + (match_operand:VDQ 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "sub<mode>3" - [(set (match_operand:VALL 0 "s_register_operand") - (minus:VALL (match_operand:VALL 1 "s_register_operand") - (match_operand:VALL 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + [(set (match_operand:VDQ 0 "s_register_operand") + (minus:VDQ (match_operand:VDQ 1 "s_register_operand") + (match_operand:VDQ 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "mul<mode>3" - [(set (match_operand:VALLW 0 "s_register_operand") - (mult:VALLW (match_operand:VALLW 1 "s_register_operand") - (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (<MODE>mode == V4HImode && TARGET_REALLY_IWMMXT)" -{ -}) + [(set (match_operand:VDQWH 0 "s_register_operand") + (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "smin<mode>3" [(set (match_operand:VALLW 0 "s_register_operand") (smin:VALLW (match_operand:VALLW 1 "s_register_operand") (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "umin<mode>3" [(set (match_operand:VINTW 0 "s_register_operand") (umin:VINTW (match_operand:VINTW 1 "s_register_operand") (match_operand:VINTW 2 "s_register_operand")))] - "TARGET_NEON - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "smax<mode>3" [(set (match_operand:VALLW 0 "s_register_operand") (smax:VALLW (match_operand:VALLW 1 "s_register_operand") (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "umax<mode>3" [(set (match_operand:VINTW 0 "s_register_operand") (umax:VINTW (match_operand:VINTW 1 "s_register_operand") (match_operand:VINTW 2 "s_register_operand")))] - "TARGET_NEON - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "vec_perm<mode>" [(match_operand:VE 0 "s_register_operand") diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 6a2bc5a..e6c287c 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -387,31 +387,15 @@ (set_attr "arch" "t2,any,any,any,a,t2,any,any,any,any,any,any")] ) -(define_insn "*mov_load_vfp_hf16" - [(set (match_operand:HF 0 "s_register_operand" "=t") - (match_operand:HF 1 "memory_operand" "Uj"))] - "TARGET_HAVE_MVE_FLOAT" - "vldr.16\\t%0, %E1" -) - -(define_insn "*mov_store_vfp_hf16" - [(set (match_operand:HF 0 "memory_operand" "=Uj") - (match_operand:HF 1 "s_register_operand" "t"))] - "TARGET_HAVE_MVE_FLOAT" - "vstr.16\\t%1, %E0" -) - ;; HFmode and BFmode moves (define_insn "*mov<mode>_vfp_<mode>16" [(set (match_operand:HFBF 0 "nonimmediate_operand" - "= ?r,?m,t,r,t,r,t, t, Um,r") + "= ?r,?m,t,r,t,r,t, t, 
Uj,r") (match_operand:HFBF 1 "general_operand" - " m,r,t,r,r,t,Dv,Um,t, F"))] + " m,r,t,r,r,t,Dv,Uj,t, F"))] "TARGET_32BIT - && TARGET_VFP_FP16INST - && arm_mve_mode_and_operands_type_check (<MODE>mode, operands[0], - operands[1]) + && (TARGET_VFP_FP16INST || TARGET_HAVE_MVE) && (s_register_operand (operands[0], <MODE>mode) || s_register_operand (operands[1], <MODE>mode))" { @@ -430,9 +414,15 @@ case 6: /* S register from immediate. */ return \"vmov.f16\\t%0, %1\t%@ __<fporbf>\"; case 7: /* S register from memory. */ - return \"vld1.16\\t{%z0}, %A1\"; + if (TARGET_HAVE_MVE) + return \"vldr.16\\t%0, %1\"; + else + return \"vld1.16\\t{%z0}, %A1\"; case 8: /* Memory from S register. */ - return \"vst1.16\\t{%z1}, %A0\"; + if (TARGET_HAVE_MVE) + return \"vstr.16\\t%1, %0\"; + else + return \"vst1.16\\t{%z1}, %A0\"; case 9: /* ARM register from constant. */ { long bits; @@ -2135,7 +2125,7 @@ (match_operand:DF 1 "const_double_operand" "F")) (clobber (match_operand:DF 2 "s_register_operand" "=r"))] "arm_disable_literal_pool - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !arm_const_double_rtx (operands[1]) && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1]))" "#" @@ -2161,7 +2151,7 @@ (match_operand:SF 1 "const_double_operand" "E")) (clobber (match_operand:SF 2 "s_register_operand" "=r"))] "arm_disable_literal_pool - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !vfp3_const_double_rtx (operands[1])" "#" "" diff --git a/gcc/config/arm/vxworks.h b/gcc/config/arm/vxworks.h index 2ebfce8..487ec0f 100644 --- a/gcc/config/arm/vxworks.h +++ b/gcc/config/arm/vxworks.h @@ -44,7 +44,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see \ if (arm_arch_xscale) \ builtin_define ("_VX_CPU=XSCALE"); \ - if (arm_arch8) \ + else if (arm_arch8) \ builtin_define ("_VX_CPU=ARMARCH8A"); \ else if (arm_arch7) \ { \ diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md index 769d8ea..8e7cf50 100644 --- a/gcc/config/bpf/bpf.md +++ b/gcc/config/bpf/bpf.md @@ -165,6 +165,16 @@ "div<msuffix>\t%0,%2" [(set_attr "type" "<mtype>")]) +;; However, xBPF does provide a signed division operator, sdiv. + +(define_insn "div<AM:mode>3" + [(set (match_operand:AM 0 "register_operand" "=r,r") + (div:AM (match_operand:AM 1 "register_operand" " 0,0") + (match_operand:AM 2 "reg_or_imm_operand" "r,I")))] + "TARGET_XBPF" + "sdiv<msuffix>\t%0,%2" + [(set_attr "type" "<mtype>")]) + ;;; Modulus ;; Note that eBPF doesn't provide instructions for signed integer @@ -178,6 +188,16 @@ "mod<msuffix>\t%0,%2" [(set_attr "type" "<mtype>")]) +;; Again, xBPF provides a signed version, smod. 
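As a sketch of what the xBPF-only patterns enable (the sdiv insn above and the smod insn just below), not taken from the patch itself: signed C division and modulus can then expand to single instructions, whereas plain eBPF has no signed divide/modulus insn. The -mxbpf option name is assumed from the TARGET_XBPF condition.

/* Compile for xBPF (e.g. -mxbpf, assumed): each operation below maps to
   one insn via the new div/mod patterns; plain eBPF cannot do this.  */
long
signed_divmod (long a, long b, long *rem)
{
  *rem = a % b;		/* smod */
  return a / b;		/* sdiv */
}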
+ +(define_insn "mod<AM:mode>3" + [(set (match_operand:AM 0 "register_operand" "=r,r") + (mod:AM (match_operand:AM 1 "register_operand" " 0,0") + (match_operand:AM 2 "reg_or_imm_operand" "r,I")))] + "TARGET_XBPF" + "smod<msuffix>\t%0,%2" + [(set_attr "type" "<mtype>")]) + ;;; Logical AND (define_insn "and<AM:mode>3" [(set (match_operand:AM 0 "register_operand" "=r,r") diff --git a/gcc/config/darwin-protos.h b/gcc/config/darwin-protos.h index 54cd1e4..49c540f 100644 --- a/gcc/config/darwin-protos.h +++ b/gcc/config/darwin-protos.h @@ -125,6 +125,6 @@ extern bool darwin_kextabi_p (void); extern void darwin_override_options (void); extern void darwin_patch_builtins (void); extern void darwin_rename_builtins (void); -extern bool darwin_libc_has_function (enum function_class fn_class); +extern bool darwin_libc_has_function (enum function_class fn_class, tree); #endif /* CONFIG_DARWIN_PROTOS_H */ diff --git a/gcc/config/darwin-sections.def b/gcc/config/darwin-sections.def index 98677f6..65bf5ad 100644 --- a/gcc/config/darwin-sections.def +++ b/gcc/config/darwin-sections.def @@ -198,3 +198,18 @@ DEF_SECTION (objc2_image_info_section, 0, ".section __DATA, __objc_imageinfo, regular, no_dead_strip", 1) DEF_SECTION (objc2_constant_string_object_section, 0, ".section __DATA, __objc_stringobj, regular, no_dead_strip", 1) + +/* Additions for compatibility with later runtime conventions especially for + sections containing strings. */ +DEF_SECTION (objc2_data_section, 0, ".section __DATA, __data", 1) + +DEF_SECTION (objc2_ivar_section, 0, ".section __DATA, __objc_ivar", 1) + +DEF_SECTION (objc2_class_names_section, 0, + ".section __TEXT, __objc_classname, cstring_literals", 1) + +DEF_SECTION (objc2_method_names_section, 0, + ".section __TEXT, __objc_methname, cstring_literals", 1) + +DEF_SECTION (objc2_method_types_section, 0, + ".section __TEXT, __objc_methtype, cstring_literals", 1) diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index c8edfb8..dd4857f 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -136,7 +136,7 @@ output_objc_section_asm_op (const void *directive) order in the object. The code below implements this by emitting a section header for each ObjC section the first time that an ObjC section is requested. */ - if (! been_here) + if (darwin_symbol_stubs && ! 
been_here) { section *saved_in_section = in_section; static const enum darwin_section_enum tomark[] = @@ -174,20 +174,23 @@ output_objc_section_asm_op (const void *directive) /* ABI=2 */ static const enum darwin_section_enum tomarkv2[] = { + objc2_method_names_section, objc2_message_refs_section, + objc2_selector_refs_section, + objc2_ivar_section, objc2_classdefs_section, objc2_metadata_section, objc2_classrefs_section, + objc2_class_names_section, objc2_classlist_section, objc2_categorylist_section, - objc2_selector_refs_section, objc2_nonlazy_class_section, objc2_nonlazy_category_section, objc2_protocollist_section, objc2_protocolrefs_section, objc2_super_classrefs_section, + objc2_constant_string_object_section, objc2_image_info_section, - objc2_constant_string_object_section } ; size_t i; @@ -1436,7 +1439,7 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) gcc_assert (TREE_CODE (ident) == IDENTIFIER_NODE); p = IDENTIFIER_POINTER (ident); - gcc_checking_assert (flag_next_runtime == 1 && flag_objc_abi == 2); + gcc_checking_assert (flag_next_runtime >= 1 && flag_objc_abi == 2); objc_metadata_seen = 1; @@ -1447,11 +1450,20 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) first. */ if (!strncmp (p, "V2_BASE", 7)) return base; + else if (!strncmp (p, "V2_CNAM", 7)) + return darwin_sections[objc2_class_names_section]; + else if (!strncmp (p, "V2_MNAM", 7)) + return darwin_sections[objc2_method_names_section]; + else if (!strncmp (p, "V2_MTYP", 7)) + return darwin_sections[objc2_method_types_section]; else if (!strncmp (p, "V2_STRG", 7)) return darwin_sections[cstring_section]; else if (!strncmp (p, "G2_META", 7) || !strncmp (p, "G2_CLAS", 7)) return darwin_sections[objc2_classdefs_section]; + else if (!strncmp (p, "V2_PCOL", 7)) + return ld_uses_coal_sects ? darwin_sections[data_coal_section] + : darwin_sections[objc2_data_section]; else if (!strncmp (p, "V2_MREF", 7)) return darwin_sections[objc2_message_refs_section]; else if (!strncmp (p, "V2_CLRF", 7)) @@ -1487,6 +1499,9 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) else if (!strncmp (p, "V2_CSTR", 7)) return darwin_sections[objc2_constant_string_object_section]; + else if (!strncmp (p, "V2_IVRF", 7)) + return darwin_sections[objc2_ivar_section]; + /* Not recognized, default. */ return base; } @@ -1500,7 +1515,7 @@ darwin_objc1_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) gcc_assert (TREE_CODE (ident) == IDENTIFIER_NODE); p = IDENTIFIER_POINTER (ident); - gcc_checking_assert (flag_next_runtime == 1 && flag_objc_abi < 2); + gcc_checking_assert (flag_next_runtime >= 1 && flag_objc_abi < 2); objc_metadata_seen = 1; @@ -1861,6 +1876,14 @@ darwin_globalize_label (FILE *stream, const char *name) { if (!!strncmp (name, "_OBJC_", 6)) default_globalize_label (stream, name); + /* We have some Objective C cases that need to be global, but only on newer + OS versions. 
*/ + if (flag_objc_abi < 2 || flag_next_runtime < 100700) + return; + if (!strncmp (name+6, "LabelPro", 8)) + default_globalize_label (stream, name); + if (!strncmp (name+6, "Protocol_", 9)) + default_globalize_label (stream, name); } /* This routine returns non-zero if 'name' starts with the special objective-c @@ -1879,7 +1902,49 @@ darwin_label_is_anonymous_local_objc_name (const char *name) while (*p >= '0' && *p <= '9') p++; } - return (!strncmp ((const char *)p, "_OBJC_", 6)); + if (strncmp ((const char *)p, "_OBJC_", 6) != 0) + return false; + + /* We need some of the objective c meta-data symbols to be visible to the + linker (when the target OS version is newer). FIXME: this is horrible, + we need a better mechanism. */ + + if (flag_objc_abi < 2 || flag_next_runtime < 100700) + return true; + + p += 6; + if (!strncmp ((const char *)p, "ClassRef", 8)) + return false; + else if (!strncmp ((const char *)p, "SelRef", 6)) + return false; + else if (!strncmp ((const char *)p, "Category", 8)) + { + if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' || p[8] == 'C' ) + return false; + return true; + } + else if (!strncmp ((const char *)p, "ClassMethods", 12)) + return false; + else if (!strncmp ((const char *)p, "Instance", 8)) + { + if (p[8] == 'I' || p[8] == 'M') + return false; + return true; + } + else if (!strncmp ((const char *)p, "CLASS_RO", 8)) + return false; + else if (!strncmp ((const char *)p, "METACLASS_RO", 12)) + return false; + else if (!strncmp ((const char *)p, "Protocol", 8)) + { + if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' + || p[8] == 'M' || p[8] == 'C' || p[8] == 'O') + return false; + return true; + } + else if (!strncmp ((const char *)p, "LabelPro", 8)) + return false; + return true; } /* LTO support for Mach-O. @@ -2384,11 +2449,7 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, unsigned HOST_WIDE_INT size, unsigned int l2align) { - /* FIXME: We have a fudge to make this work with Java even when the target does - not use sections anchors -- Java seems to need at least one small item in a - non-zerofill segment. */ - if ((DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) - || (size && size <= 2)) + if (DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) { /* Put smaller objects in _static_data, where the section anchors system can get them. @@ -2414,16 +2475,13 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, } else { - /* When we are on a non-section anchor target, we can get zero-sized - items here. However, all we need to do is to bump them to one byte - and the section alignment will take care of the rest. */ + /* When we are on a non-section anchor target (or not using section + anchors, we can get zero-sized items here. However, all we need to + do is to bump them to one byte and the section alignment will take + care of the rest. */ char secnam[64]; - unsigned int flags ; - snprintf (secnam, 64, "__DATA,__%sbss%u", ((size)?"":"zo_"), - (unsigned) l2align); - /* We can't anchor (yet, if ever) in zerofill sections, because we can't - switch to them and emit a label. 
*/ - flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; + snprintf (secnam, 64, "__DATA,__bss"); + unsigned int flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; in_section = get_section (secnam, flags, NULL); fprintf (fp, "\t.zerofill %s,", secnam); assemble_name (fp, name); @@ -2434,7 +2492,7 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, (unsigned) l2align); else - fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED"\n", size); + fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",0\n", size); } (*targetm.encode_section_info) (decl, DECL_RTL (decl), false); @@ -2559,9 +2617,8 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" return; } - /* So we have a public symbol (small item fudge for Java, see above). */ - if ((DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) - || (size && size <= 2)) + /* So we have a public symbol. */ + if (DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) { /* Put smaller objects in data, where the section anchors system can get them. However, if they are zero-sized punt them to yet a different @@ -2586,16 +2643,10 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" } else { + /* Section anchors not in use. */ + unsigned int flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; char secnam[64]; - unsigned int flags ; - /* When we are on a non-section anchor target, we can get zero-sized - items here. However, all we need to do is to bump them to one byte - and the section alignment will take care of the rest. */ - snprintf (secnam, 64, "__DATA,__%spu_bss%u", ((size)?"":"zo_"), l2align); - - /* We can't anchor in zerofill sections, because we can't switch - to them and emit a label. */ - flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; + snprintf (secnam, 64, "__DATA,__common"); in_section = get_section (secnam, flags, NULL); fprintf (fp, "\t.zerofill %s,", secnam); assemble_name (fp, name); @@ -2605,7 +2656,7 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" if (l2align) fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, l2align); else - fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED"\n", size); + fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",0\n", size); } (* targetm.encode_section_info) (decl, DECL_RTL (decl), false); } @@ -3141,10 +3192,14 @@ darwin_override_options (void) /* Keep track of which (major) version we're generating code for. */ if (darwin_macosx_version_min) { - if (strverscmp (darwin_macosx_version_min, "10.6") >= 0) + if (strverscmp (darwin_macosx_version_min, "10.7") >= 0) + generating_for_darwin_version = 11; + else if (strverscmp (darwin_macosx_version_min, "10.6") >= 0) generating_for_darwin_version = 10; else if (strverscmp (darwin_macosx_version_min, "10.5") >= 0) generating_for_darwin_version = 9; + else if (strverscmp (darwin_macosx_version_min, "10.4") >= 0) + generating_for_darwin_version = 8; /* Earlier versions are not specifically accounted, until required. */ } @@ -3160,6 +3215,20 @@ darwin_override_options (void) should check for correctness re. the ABI. TODO: check and provide the flags (runtime & ABI) from the lto wrapper). */ + /* At present, make a hard update to the runtime version based on the target + OS version. 
*/ + if (flag_next_runtime) + { + if (generating_for_darwin_version > 10) + flag_next_runtime = 100705; + else if (generating_for_darwin_version > 9) + flag_next_runtime = 100608; + else if (generating_for_darwin_version > 8) + flag_next_runtime = 100508; + else + flag_next_runtime = 100000; + } + /* Unless set, force ABI=2 for NeXT and m64, 0 otherwise. */ if (!global_options_set.x_flag_objc_abi) global_options.x_flag_objc_abi @@ -3542,7 +3611,8 @@ darwin_rename_builtins (void) } bool -darwin_libc_has_function (enum function_class fn_class) +darwin_libc_has_function (enum function_class fn_class, + tree type ATTRIBUTE_UNUSED) { if (fn_class == function_sincos) return (strverscmp (darwin_macosx_version_min, "10.9") >= 0); diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 55a5361..f9d4fec 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -107,7 +107,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Default to using the NeXT-style runtime, since that's what is pre-installed on Darwin systems. */ -#define NEXT_OBJC_RUNTIME 1 +#define NEXT_OBJC_RUNTIME 100508 /* Don't default to pcc-struct-return, because gcc is the only compiler, and we want to retain compatibility with older gcc versions. */ @@ -476,6 +476,7 @@ extern GTY(()) int darwin_ms_struct; debugging data. */ #define ASM_DEBUG_SPEC "%{g*:%{%:debug-level-gt(0):%{!gdwarf*:--gstabs}}}" +#define ASM_DEBUG_OPTION_SPEC "" #define ASM_FINAL_SPEC \ "%{gsplit-dwarf:%ngsplit-dwarf is not supported on this platform} %<gsplit-dwarf" diff --git a/gcc/config/darwin9.h b/gcc/config/darwin9.h index b7bdf63..787aca7 100644 --- a/gcc/config/darwin9.h +++ b/gcc/config/darwin9.h @@ -41,6 +41,9 @@ along with GCC; see the file COPYING3. If not see #undef ASM_DEBUG_SPEC #define ASM_DEBUG_SPEC "%{g*:%{%:debug-level-gt(0):%{gstabs:--gstabs}}}" +#undef ASM_DEBUG_OPTION_SPEC +#define ASM_DEBUG_OPTION_SPEC "" + #undef ASM_OUTPUT_ALIGNED_COMMON #define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ do { \ diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 0e73fea..763e770 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -67,6 +67,7 @@ UNSPECV_ICACHE_INV]) (define_c_enum "unspec" [ + UNSPEC_ADDPTR UNSPEC_VECTOR UNSPEC_BPERMUTE UNSPEC_SGPRBASE @@ -1219,29 +1220,47 @@ ; "addptr" is the same as "add" except that it must not write to VCC or SCC ; as a side-effect. Unfortunately GCN does not have a suitable instruction -; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp. -; Note that it is not safe to save/clobber/restore SCC because doing so will -; break data-flow analysis, so this must use vector registers. +; for this, so we use CC_SAVE_REG as a temp. +; Note that it is not safe to save/clobber/restore as separate insns because +; doing so will break data-flow analysis, so this must use multiple +; instructions in one insn. ; ; The "v0" should be just "v", but somehow the "0" helps LRA not loop forever ; on testcase pr54713-2.c with -O0. It's only an optimization hint anyway. +; +; The SGPR alternative is preferred as it is typically used with mov_sgprbase. 
(define_insn "addptrdi3" - [(set (match_operand:DI 0 "register_operand" "= v") - (plus:DI (match_operand:DI 1 "register_operand" " v0") - (match_operand:DI 2 "nonmemory_operand" "vDA")))] + [(set (match_operand:DI 0 "register_operand" "= v, Sg") + (unspec:DI [ + (plus:DI (match_operand:DI 1 "register_operand" "^v0,Sg0") + (match_operand:DI 2 "nonmemory_operand" "vDA,SgDB"))] + UNSPEC_ADDPTR))] "" { - rtx new_operands[4] = { operands[0], operands[1], operands[2], - gen_rtx_REG (DImode, CC_SAVE_REG) }; + if (which_alternative == 0) + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (DImode, CC_SAVE_REG) }; - output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands); - output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands); + output_asm_insn ("v_add%^_u32\t%L0, %3, %L2, %L1", new_operands); + output_asm_insn ("v_addc%^_u32\t%H0, %3, %H2, %H1, %3", new_operands); + } + else + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (BImode, CC_SAVE_REG) }; + + output_asm_insn ("s_mov_b32\t%3, scc", new_operands); + output_asm_insn ("s_add_u32\t%L0, %L1, %L2", new_operands); + output_asm_insn ("s_addc_u32\t%H0, %H1, %H2", new_operands); + output_asm_insn ("s_cmpk_lg_u32\t%3, 0", new_operands); + } return ""; } - [(set_attr "type" "vmult") - (set_attr "length" "16")]) + [(set_attr "type" "vmult,mult") + (set_attr "length" "16,24")]) ;; }}} ;; {{{ ALU special cases: Minus diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 0983b98..f7589a5 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -737,7 +737,8 @@ compile_native (const char *infile, const char *outfile, const char *compiler, obstack_ptr_grow (&argv_obstack, NULL); const char **new_argv = XOBFINISH (&argv_obstack, const char **); - fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true); + fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true, + ".gccnative_args"); obstack_free (&argv_obstack, NULL); } @@ -1001,7 +1002,7 @@ main (int argc, char **argv) unsetenv ("LIBRARY_PATH"); /* Run the compiler pass. */ - fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true); + fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true, ".gcc_args"); obstack_free (&cc_argv_obstack, NULL); in = fopen (gcn_s1_name, "r"); @@ -1022,7 +1023,7 @@ main (int argc, char **argv) fclose (out); /* Run the assemble/link pass. */ - fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true); + fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true, ".ld_args"); obstack_free (&ld_argv_obstack, NULL); in = fopen (gcn_o_name, "r"); diff --git a/gcc/config/i386/adxintrin.h b/gcc/config/i386/adxintrin.h index 6c15417..6dffe45 100644 --- a/gcc/config/i386/adxintrin.h +++ b/gcc/config/i386/adxintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <adxintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <adxintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _ADXINTRIN_H_INCLUDED diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h new file mode 100644 index 0000000..77cc395 --- /dev/null +++ b/gcc/config/i386/amxbf16intrin.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxbf16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXBF16INTRIN_H_INCLUDED +#define _AMXBF16INTRIN_H_INCLUDED + +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif /* __AMX_BF16__ */ + +#if defined(__x86_64__) && defined(__AMX_BF16__) +#define _tile_dpbf16ps_internal(dst,src1,src2) \ + __asm__ volatile\ + ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbf16ps(dst,src1,src2) \ + _tile_dpbf16ps_internal (dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_BF16__ */ + +#endif /* _AMXBF16INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h new file mode 100644 index 0000000..f4e410b --- /dev/null +++ b/gcc/config/i386/amxint8intrin.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxint8intrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXINT8INTRIN_H_INCLUDED +#define _AMXINT8INTRIN_H_INCLUDED + +#if !defined(__AMX_INT8__) +#pragma GCC push_options +#pragma GCC target("amx-int8") +#define __DISABLE_AMX_INT8__ +#endif /* __AMX_INT8__ */ + +#if defined(__x86_64__) && defined(__AMX_INT8__) +#define _tile_int8_dp_internal(name,dst,src1,src2) \ + __asm__ volatile \ + ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbssd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbssd, dst, src1, src2) + +#define _tile_dpbsud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbsud, dst, src1, src2) + +#define _tile_dpbusd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbusd, dst, src1, src2) + +#define _tile_dpbuud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbuud, dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_INT8__ +#undef __DISABLE_AMX_INT8__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_INT8__ */ + +#endif /* _AMXINT8INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h new file mode 100644 index 0000000..41fb9a5 --- /dev/null +++ b/gcc/config/i386/amxtileintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxtileintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXTILEINTRIN_H_INCLUDED +#define _AMXTILEINTRIN_H_INCLUDED + +#if !defined(__AMX_TILE__) +#pragma GCC push_options +#pragma GCC target("amx-tile") +#define __DISABLE_AMX_TILE__ +#endif /* __AMX_TILE__ */ + +#if defined(__x86_64__) && defined(__AMX_TILE__) +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_loadconfig (const void *__config) +{ + __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_storeconfig (void *__config) +{ + __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_release (void) +{ + __asm__ volatile ("tilerelease" ::); +} + +#define _tile_loadd(dst,base,stride) \ + _tile_loadd_internal (dst, base, stride) + +#define _tile_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) base), "r" ((long) stride)) + +#define _tile_stream_loadd(dst,base,stride) \ + _tile_stream_loadd_internal (dst, base, stride) + +#define _tile_stream_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) base), "r" ((long) stride)) + +#define _tile_stored(dst,base,stride) \ + _tile_stored_internal (dst, base, stride) + +#define _tile_stored_internal(src,base,stride) \ + __asm__ volatile \ + ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \ + :: "r" ((void*) base), "r" ((long) stride) \ + : "memory") + +#define _tile_zero(dst) \ + _tile_zero_internal (dst) + +#define _tile_zero_internal(dst) \ + __asm__ volatile \ + ("tilezero\t%%tmm"#dst ::) + +#endif + +#ifdef __DISABLE_AMX_TILE__ +#undef __DISABLE_AMX_TILE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TILE__ */ + +#endif /* _AMXTILEINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 6bf1f8c..e29c532 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -950,6 +950,9 @@ _mm256_broadcastsi128_si256 (__m128i __X) return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); } +#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) +#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X) + #ifdef __OPTIMIZE__ extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index d19c104..3da05e1 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -36,7 +36,11 @@ /* Internal data types for implementing the intrinsics. 
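The new AMX headers above (amx-tile, amx-int8, amx-bf16) wrap the tile instructions as inline asm; roughly, they are used as below. This is only a sketch, not part of the patch: the 64-byte tile-configuration layout and the -mamx-tile/-mamx-int8 option names are assumed from the usual AMX conventions rather than from this diff.

/* Sketch: C[16][16] (int32) += A[16][64] (int8) x B (int8, already
   VNNI/K-packed), using the intrinsics added in amxtileintrin.h and
   amxint8intrin.h.  Assumes x86_64 and the AMX features enabled.  */
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

void
amx_dpbssd_16x16 (int32_t *c, const int8_t *a, const int8_t *b)
{
  /* Assumed 64-byte config: byte 0 palette, byte 1 start_row,
     bytes 16..47 colsb[16] (uint16), bytes 48..63 rows[16] (uint8).  */
  struct
  {
    uint8_t palette, start_row, rsvd[14];
    uint16_t colsb[16];
    uint8_t rows[16];
  } cfg;
  memset (&cfg, 0, sizeof cfg);
  cfg.palette = 1;
  cfg.rows[0] = 16; cfg.colsb[0] = 64;	/* tmm0: C, 16x16 int32.  */
  cfg.rows[1] = 16; cfg.colsb[1] = 64;	/* tmm1: A, 16x64 int8.   */
  cfg.rows[2] = 16; cfg.colsb[2] = 64;	/* tmm2: B, packed int8.  */
  _tile_loadconfig (&cfg);

  _tile_loadd (0, c, 64);	/* accumulator tile */
  _tile_loadd (1, a, 64);
  _tile_loadd (2, b, 64);
  _tile_dpbssd (0, 1, 2);	/* tmm0 += tmm1 . tmm2 */
  _tile_stored (0, c, 64);
  _tile_release ();
}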
*/ typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef short __v32hi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); typedef unsigned long long __mmask64; @@ -303,6 +307,13 @@ _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi16 (void const *__P) +{ + return (__m512i) (*(__v32hi_u *) __P); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, @@ -322,6 +333,13 @@ _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + *(__v32hi_u *) __P = (__v32hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) { __builtin_ia32_storedquhi512_mask ((short *) __P, @@ -382,6 +400,13 @@ _kunpackd_mask64 (__mmask32 __A, __mmask32 __B) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi8 (void const *__P) +{ + return (__m512i) (*(__v64qi_u *) __P); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, @@ -401,6 +426,13 @@ _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + *(__v64qi_u *) __P = (__v64qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) { __builtin_ia32_storedquqi512_mask ((char *) __P, diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index d28dfab..fd61b70 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -1168,6 +1168,17 @@ _mm_reduce_sd (__m128d __A, __m128d __B, int __C) extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C) { @@ -1179,6 +1190,17 @@ _mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_reduce_sd 
(__mmask8 __U, __m128d __A, __m128d __B, int __C) { return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, @@ -1187,6 +1209,18 @@ _mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) (__mmask8) __U); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + __U, __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_reduce_ss (__m128 __A, __m128 __B, int __C) @@ -1197,6 +1231,16 @@ _mm_reduce_ss (__m128 __A, __m128 __B, int __C) (__mmask8) -1); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -1211,6 +1255,17 @@ _mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A, extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) { return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, @@ -1219,6 +1274,18 @@ _mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) (__mmask8) __U); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + __U, __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_range_sd (__m128d __A, __m128d __B, int __C) @@ -1808,6 +1875,17 @@ _mm512_reduce_pd (__m512d __A, int __B) extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_pd (__m512d __A, int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) { return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, @@ -1817,6 +1895,17 @@ _mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) __W, + __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) { return 
(__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, @@ -1825,6 +1914,18 @@ _mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) (__mmask8) __U); } +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B, + const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_reduce_ps (__m512 __A, int __B) @@ -1837,6 +1938,17 @@ _mm512_reduce_ps (__m512 __A, int __B) extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ps (__m512 __A, int __B, const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) { return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, @@ -1846,6 +1958,17 @@ _mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) __W, + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) { return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, @@ -1854,6 +1977,18 @@ _mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) (__mmask16) __U); } +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_extractf32x8_ps (__m512 __A, const int __imm) @@ -2440,26 +2575,50 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1)) +#define _mm512_reduce_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) + #define _mm512_mask_reduce_pd(W, U, A, B) \ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)(__m512d)(W), (U), (R))) + #define _mm512_maskz_reduce_pd(U, A, B) \ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U))) +#define _mm512_maskz_reduce_round_pd(U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (U), (R))) + #define _mm512_reduce_ps(A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ 
(int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1)) +#define _mm512_reduce_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) + #define _mm512_mask_reduce_ps(W, U, A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) +#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)(__m512)(W), (U), (R))) + #define _mm512_maskz_reduce_ps(U, A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U))) +#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) + #define _mm512_extractf32x8_ps(X, C) \ ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \ (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1)) @@ -2679,6 +2838,20 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ (__mmask8)(U))) +#define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R))) + +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U), (int)(R))) + #define _mm_reduce_ss(A, B, C) \ ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ @@ -2693,6 +2866,19 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ (__mmask8)(U))) +#define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R))) + +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (int)(R))) #endif diff --git a/gcc/config/i386/avx512erintrin.h b/gcc/config/i386/avx512erintrin.h index b9804c9..6ec8ee2 100644 --- a/gcc/config/i386/avx512erintrin.h +++ b/gcc/config/i386/avx512erintrin.h @@ -168,6 +168,30 @@ _mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R) __R); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) 
__builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) @@ -177,6 +201,30 @@ _mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_rsqrt28_round_pd (__m512d __A, int __R) @@ -242,6 +290,30 @@ _mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) __R); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) @@ -251,6 +323,30 @@ _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + #else #define _mm512_exp2a23_round_pd(A, C) \ __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) @@ -309,17 +405,69 @@ _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) #define _mm_rcp28_round_sd(A, B, R) \ __builtin_ia32_rcp28sd_round(A, B, R) +#define _mm_mask_rcp28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_sd(U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (), \ + (U), (R)) + #define _mm_rcp28_round_ss(A, B, R) \ __builtin_ia32_rcp28ss_round(A, B, R) +#define _mm_mask_rcp28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_ss(U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (), \ + (U), (R)) + #define _mm_rsqrt28_round_sd(A, B, R) \ 
__builtin_ia32_rsqrt28sd_round(A, B, R) +#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (),\ + (U), (R)) + #define _mm_rsqrt28_round_ss(A, B, R) \ __builtin_ia32_rsqrt28ss_round(A, B, R) +#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (),\ + (U), (R)) + #endif +#define _mm_mask_rcp28_sd(W, U, A, B)\ + _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(U, A, B)\ + _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(W, U, A, B)\ + _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(U, A, B)\ + _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(W, U, A, B)\ + _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(U, A, B)\ + _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(W, U, A, B)\ + _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(U, A, B)\ + _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + #define _mm512_exp2a23_pd(A) \ _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index 729d568..6342fde 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -2124,6 +2124,18 @@ _mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__v4sf) _mm_setzero_ps (), U, C) #endif +#define _mm_mask_sqrt_sd(W, U, A, B) \ + _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_sd(U, A, B) \ + _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_sqrt_ss(W, U, A, B) \ + _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_ss(U, A, B) \ + _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cvtepi8_epi32 (__m128i __A) @@ -3259,6 +3271,18 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__v4sf)_mm_setzero_ps (), -1, C) #endif +#define _mm_mask_scalef_sd(W, U, A, B) \ + _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_sd(U, A, B) \ + _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_scalef_ss(W, U, A, B) \ + _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_ss(U, A, B) \ + _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + #ifdef __OPTIMIZE__ extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -8621,6 +8645,30 @@ _mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return 
(__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + _mm_setzero_ps (), + __U, + __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) @@ -8629,6 +8677,30 @@ _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) (__v4sf) __B, __R); } + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + _mm_setzero_pd (), + __U, + __R); +} #else #define _mm512_cvt_roundpd_ps(A, B) \ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) @@ -8642,10 +8714,37 @@ _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) #define _mm_cvt_roundsd_ss(A, B, C) \ (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \ + (U), (C)) + #define _mm_cvt_roundss_sd(A, B, C) \ (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) + +#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \ + (U), (C)) + #endif +#define _mm_mask_cvtss_sd(W, U, A, B) \ + _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtss_sd(U, A, B) \ + _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_cvtsd_ss(W, U, A, B) \ + _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtsd_ss(U, A, B) \ + _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_stream_si512 (__m512i * __P, __m512i __A) @@ -14265,6 +14364,14 @@ _mm_cvttss_i64 (__m128 __A) } #endif /* __x86_64__ */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsi512_si32 (__m512i __A) +{ + __v16si __B = (__v16si) __A; + return __B[0]; +} + extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_u32 (__m128 __A) @@ -14289,6 +14396,34 @@ _mm_cvttss_i32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i32 (__m128 
__A) +{ + return (int) __builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + #ifdef __x86_64__ extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -14315,6 +14450,34 @@ _mm_cvttsd_i64 (__m128d __A) return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, _MM_FROUND_CUR_DIRECTION); } + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sd (__m128d __A, long long __B) +{ + return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} #endif /* __x86_64__ */ extern __inline unsigned diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h index cd4275e..b4b1d7f 100644 --- a/gcc/config/i386/avx512vlbwintrin.h +++ b/gcc/config/i386/avx512vlbwintrin.h @@ -34,6 +34,15 @@ #define __DISABLE_AVX512VLBW__ #endif /* __AVX512VLBW__ */ +/* Internal data types for implementing the intrinsics. 
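The unsuffixed scalar conversions added above (_mm_cvtsd_i32, _mm_cvtss_i32, _mm_cvti32_sd, _mm_cvti32_ss and their 64-bit counterparts) convert using the current rounding mode. A minimal usage sketch, not part of the commit, assuming -mavx512f since they live in avx512fintrin.h; the function names below are only for illustration.

/* Sketch: uses only the intrinsics defined above plus standard SSE2 types.  */
#include <immintrin.h>

int
sd_to_int (__m128d x)
{
  return _mm_cvtsd_i32 (x);          /* cvtsd2si, current rounding mode */
}

__m128d
int_to_sd (__m128d upper, int i)
{
  return _mm_cvti32_sd (upper, i);   /* cvtsi2sd into the low element */
}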
*/ +typedef short __v16hi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef short __v8hi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef char __v32qi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef char __v16qi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -75,6 +84,13 @@ _mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + *(__v32qi_u *) __P = (__v32qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) { __builtin_ia32_storedquqi256_mask ((char *) __P, @@ -84,6 +100,13 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + *(__v16qi_u *) __P = (__v16qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) { __builtin_ia32_storedquqi128_mask ((char *) __P, @@ -93,6 +116,13 @@ _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi16 (void const *__P) +{ + return (__m256i) (*(__v16hi_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P, @@ -112,6 +142,13 @@ _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi16 (void const *__P) +{ + return (__m128i) (*(__v8hi_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P, @@ -170,6 +207,13 @@ _mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi8 (void const *__P) +{ + return (__m256i) (*(__v32qi_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P, @@ -189,6 +233,13 @@ _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi8 (void const *__P) +{ + return (__m128i) (*(__v16qi_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P, @@ -3710,6 +3761,13 @@ _mm256_cmple_epu16_mask (__m256i __X, __m256i __Y) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + *(__v16hi_u *) __P = (__v16hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) { __builtin_ia32_storedquhi256_mask ((short *) __P, @@ -3719,6 +3777,13 @@ _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + *(__v8hi_u *) __P = (__v8hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedquhi128_mask ((short *) __P, diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h index 7abd601..99666c7 100644 --- a/gcc/config/i386/avx512vlintrin.h +++ b/gcc/config/i386/avx512vlintrin.h @@ -36,6 +36,14 @@ /* Internal data types for implementing the intrinsics. */ typedef unsigned int __mmask32; +typedef int __v4si_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef int __v8si_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef long long __v2di_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef long long __v4di_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -265,6 +273,13 @@ _mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, @@ -286,6 +301,13 @@ _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, @@ -363,6 +385,13 @@ _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, @@ -384,6 +413,13 @@ _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, @@ -405,6 +441,13 @@ _mm_maskz_load_epi32 (__mmask8 __U, void const 
*__P) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__v8si *) __P = (__v8si) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, @@ -414,6 +457,13 @@ _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__v4si *) __P = (__v4si) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, @@ -719,6 +769,13 @@ _mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, @@ -738,6 +795,13 @@ _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, @@ -789,6 +853,13 @@ _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, @@ -808,6 +879,13 @@ _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, @@ -13730,6 +13808,13 @@ _mm256_permutex_pd (__m256d __X, const int __M) #endif #define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A)) +#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \ + _mm256_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm256_maskz_cvt_roundps_ph(A, B, C) \ + _mm256_maskz_cvtps_ph ((A), (B), (C)) +#define _mm_mask_cvt_roundps_ph(A, B, C, D) \ + _mm_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C)) #ifdef __DISABLE_AVX512VL__ #undef __DISABLE_AVX512VL__ diff --git a/gcc/config/i386/avx512vp2intersectintrin.h b/gcc/config/i386/avx512vp2intersectintrin.h index 60cb52c..f368d83 100644 --- 
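The unmasked load/store intrinsics added above compile to plain vector moves rather than masked builtins. A small sketch, not part of the commit, assuming -mavx512vl (the epi8/epi16 forms earlier also need -mavx512bw); the function name is illustrative.

/* Sketch: _mm_loadu_epi32 tolerates unaligned pointers, while
   _mm_store_epi32 expects dst to be 16-byte aligned.  */
#include <immintrin.h>

void
add_one (int *dst, const int *src)
{
  __m128i v = _mm_loadu_epi32 (src);
  v = _mm_add_epi32 (v, _mm_set1_epi32 (1));
  _mm_store_epi32 (dst, v);
}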
a/gcc/config/i386/avx512vp2intersectintrin.h +++ b/gcc/config/i386/avx512vp2intersectintrin.h @@ -1,3 +1,26 @@ +/* Copyright (C) 2019-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + #if !defined _IMMINTRIN_H_INCLUDED #error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead." #endif diff --git a/gcc/config/i386/avx512vp2intersectvlintrin.h b/gcc/config/i386/avx512vp2intersectvlintrin.h index 26eee36..f657840 100644 --- a/gcc/config/i386/avx512vp2intersectvlintrin.h +++ b/gcc/config/i386/avx512vp2intersectvlintrin.h @@ -1,3 +1,26 @@ +/* Copyright (C) 2019-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + #if !defined _IMMINTRIN_H_INCLUDED #error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead." #endif diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h index 22b2bae..fd5cf6a 100644 --- a/gcc/config/i386/avxintrin.h +++ b/gcc/config/i386/avxintrin.h @@ -444,6 +444,13 @@ _mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) (__v4sf)(__m128)(Y), (int)(P))) #endif +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsi256_si32 (__m256i __A) +{ + __v8si __B = (__v8si) __A; + return __B[0]; +} + extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cvtepi32_pd (__m128i __A) { diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h index c5de9eb..9fdd08c 100644 --- a/gcc/config/i386/bmi2intrin.h +++ b/gcc/config/i386/bmi2intrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. 
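_mm256_cvtsi256_si32 mirrors the long-standing _mm_cvtsi128_si32: it extracts element 0 of the vector as a scalar int, and the 512-bit _mm512_cvtsi512_si32 added earlier in this patch does the same for __m512i. A hedged sketch, not part of the commit, assuming -mavx; the function name is illustrative.

#include <immintrin.h>

int
low_lane (__m256i v)
{
  return _mm256_cvtsi256_si32 (v);   /* element 0 of the vector */
}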
*/ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _BMI2INTRIN_H_INCLUDED diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h index 8ba6e5b..5bd712a 100644 --- a/gcc/config/i386/bmiintrin.h +++ b/gcc/config/i386/bmiintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _BMIINTRIN_H_INCLUDED diff --git a/gcc/config/i386/cetintrin.h b/gcc/config/i386/cetintrin.h index 095bbe0..81c4d72 100644 --- a/gcc/config/i386/cetintrin.h +++ b/gcc/config/i386/cetintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <cetintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <cetintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CETINTRIN_H_INCLUDED diff --git a/gcc/config/i386/cldemoteintrin.h b/gcc/config/i386/cldemoteintrin.h index 8c0feca..0c31c35 100644 --- a/gcc/config/i386/cldemoteintrin.h +++ b/gcc/config/i386/cldemoteintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <cldemoteintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <cldemoteintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLDEMOTE_H_INCLUDED diff --git a/gcc/config/i386/clflushoptintrin.h b/gcc/config/i386/clflushoptintrin.h index 037f044..a3697f0 100644 --- a/gcc/config/i386/clflushoptintrin.h +++ b/gcc/config/i386/clflushoptintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <clflushoptintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLFLUSHOPTINTRIN_H_INCLUDED diff --git a/gcc/config/i386/clwbintrin.h b/gcc/config/i386/clwbintrin.h index 84d0939..3f83962 100644 --- a/gcc/config/i386/clwbintrin.h +++ b/gcc/config/i386/clwbintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <clwbintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <clwbintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLWBINTRIN_H_INCLUDED diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index af37f5c..0b902d5 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -183,6 +183,10 @@ "@internal Memory operand without REX prefix." (match_operand 0 "norex_memory_operand")) +(define_special_memory_constraint "Br" + "@internal bcst memory operand." 
+ (match_operand 0 "bcst_mem_operand")) + (define_constraint "Bs" "@internal Sibcall memory operand." (ior (and (not (match_test "TARGET_INDIRECT_BRANCH_REGISTER")) diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index bca61d62..22d284e 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -26,6 +26,7 @@ /* %eax */ #define bit_AVX512BF16 (1 << 5) +#define bit_HRESET (1 << 22) /* %ecx */ #define bit_SSE3 (1 << 0) @@ -124,9 +125,13 @@ #define bit_AVX5124FMAPS (1 << 3) #define bit_AVX512VP2INTERSECT (1 << 8) #define bit_IBT (1 << 20) +#define bit_UINTR (1 << 5) #define bit_PCONFIG (1 << 18) #define bit_SERIALIZE (1 << 14) #define bit_TSXLDTRK (1 << 16) +#define bit_AMX_BF16 (1 << 22) +#define bit_AMX_TILE (1 << 24) +#define bit_AMX_INT8 (1 << 25) /* XFEATURE_ENABLED_MASK register bits (%eax == 0xd, %ecx == 0) */ #define bit_BNDREGS (1 << 3) diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index 545d3bf..8ff240e 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -715,6 +715,19 @@ _mm_loadu_si64 (void const *__P) return _mm_loadl_epi64 ((__m128i_u *)__P); } +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si32 (void const *__P) +{ + return _mm_set_epi32 ((int)0, (int)0, (int)0, *(int *)__P); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si16 (void const *__P) +{ + return _mm_set_epi16 ((short)0, (short)0, (short)0, (short)0, + (short)0, (short)0, (short)0, *(short *)__P); +} + extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_si128 (__m128i *__P, __m128i __B) { @@ -739,6 +752,18 @@ _mm_storeu_si64 (void *__P, __m128i __B) _mm_storel_epi64 ((__m128i_u *)__P, __B); } +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si32 (void *__P, __m128i __B) +{ + *(__m32_u *)__P = (__m32) ((__v4si)__B)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si16 (void *__P, __m128i __B) +{ + *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0]; +} + extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movepi64_pi64 (__m128i __B) { diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h index 4b2efcb..dcb6507 100644 --- a/gcc/config/i386/enqcmdintrin.h +++ b/gcc/config/i386/enqcmdintrin.h @@ -21,12 +21,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <enqcmdntrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <enqcmdintrin.h> directly; include <x86gprintrin.h> instead." #endif -#ifndef _ENQCMDNTRIN_H_INCLUDED -#define _ENQCMDNTRIN_H_INCLUDED +#ifndef _ENQCMDINTRIN_H_INCLUDED +#define _ENQCMDINTRIN_H_INCLUDED #ifndef __ENQCMD__ #pragma GCC push_options @@ -52,4 +52,4 @@ _enqcmds (void * __P, const void * __Q) #undef __DISABLE_ENQCMD__ #pragma GCC pop_options #endif /* __DISABLE_ENQCMD__ */ -#endif /* _ENQCMDNTRIN_H_INCLUDED. */ +#endif /* _ENQCMDINTRIN_H_INCLUDED. */ diff --git a/gcc/config/i386/fxsrintrin.h b/gcc/config/i386/fxsrintrin.h index fde05a7..6e059df 100644 --- a/gcc/config/i386/fxsrintrin.h +++ b/gcc/config/i386/fxsrintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. 
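The new emmintrin.h helpers give an intrinsic spelling for narrow scalar loads and stores through an __m128i; plain SSE2 is sufficient. A usage sketch, not part of the commit, with illustrative names.

#include <emmintrin.h>

void
copy_one_int (void *dst, const void *src)
{
  __m128i v = _mm_loadu_si32 (src);   /* 32 bits into element 0, upper bits zeroed */
  _mm_storeu_si32 (dst, v);           /* write back only the low 32 bits */
}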
*/ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <fxsrintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _FXSRINTRIN_H_INCLUDED diff --git a/gcc/config/i386/hresetintrin.h b/gcc/config/i386/hresetintrin.h new file mode 100644 index 0000000..bdbe253 --- /dev/null +++ b/gcc/config/i386/hresetintrin.h @@ -0,0 +1,48 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _X86GPRINTRIN_H_INCLUDED +# error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead." +#endif + +#ifndef _HRESETINTRIN_H_INCLUDED +#define _HRESETINTRIN_H_INCLUDED + +#ifndef __HRESET__ +#pragma GCC push_options +#pragma GCC target ("hreset") +#define __DISABLE_HRESET__ +#endif /* __HRESET__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_hreset (unsigned int __EAX) +{ + __builtin_ia32_hreset (__EAX); +} + +#ifdef __DISABLE_HRESET__ +#undef __DISABLE_HRESET__ +#pragma GCC pop_options +#endif /* __DISABLE_HRESET__ */ +#endif /* _HRESETINTRIN_H_INCLUDED. 
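A usage sketch for the new _hreset intrinsic, not part of the commit: it simply wraps __builtin_ia32_hreset, and the EAX operand selects which processor history state to reset (which bits are meaningful depends on what the platform enables). Requires -mhreset or the equivalent target attribute; the function name is illustrative.

#include <x86gprintrin.h>

__attribute__ ((target ("hreset")))
void
reset_history (unsigned int mask)
{
  _hreset (mask);   /* expands to __builtin_ia32_hreset (mask) */
}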
*/ diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 1adf7c4..964633d 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -194,6 +194,7 @@ DEF_FUNCTION_TYPE (UNSIGNED) DEF_FUNCTION_TYPE (UINT) DEF_FUNCTION_TYPE (USHORT) DEF_FUNCTION_TYPE (INT) +DEF_FUNCTION_TYPE (UINT8) DEF_FUNCTION_TYPE (VOID) DEF_FUNCTION_TYPE (PVOID) @@ -443,6 +444,7 @@ DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, UQI) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, QI, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, UQI) +DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, UQI, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DI, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT, UQI) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, UQI) @@ -452,6 +454,7 @@ DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, UHI) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, HI, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, UHI) +DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, UHI, INT) DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT, V16SI, UHI) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI, INT) @@ -1026,8 +1029,10 @@ DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, UQI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, UQI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, UQI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, UQI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index fec5cef..882cba5 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -452,6 +452,14 @@ BDESC (0, OPTION_MASK_ISA2_SERIALIZE, CODE_FOR_serialize, "__builtin_ia32_serial BDESC (0, OPTION_MASK_ISA2_TSXLDTRK, CODE_FOR_xsusldtrk, "__builtin_ia32_xsusldtrk", IX86_BUILTIN_XSUSLDTRK, UNKNOWN, (int) VOID_FTYPE_VOID) BDESC (0, OPTION_MASK_ISA2_TSXLDTRK, CODE_FOR_xresldtrk, "__builtin_ia32_xresldtrk", IX86_BUILTIN_XRESLDTRK, UNKNOWN, (int) VOID_FTYPE_VOID) +/* UINTR. */ +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_UINTR, CODE_FOR_clui, "__builtin_ia32_clui", IX86_BUILTIN_CLUI, UNKNOWN, (int) VOID_FTYPE_VOID) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_UINTR, CODE_FOR_stui, "__builtin_ia32_stui", IX86_BUILTIN_STUI, UNKNOWN, (int) VOID_FTYPE_VOID) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_UINTR, CODE_FOR_senduipi, "__builtin_ia32_senduipi", IX86_BUILTIN_SENDUIPI, UNKNOWN, (int) VOID_FTYPE_UINT64) + +/* HRESET */ +BDESC (0, OPTION_MASK_ISA2_HRESET, CODE_FOR_hreset, "__builtin_ia32_hreset", IX86_BUILTIN_HRESET, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) + BDESC_END (SPECIAL_ARGS, ARGS) /* Builtins with variable number of arguments. 
*/ @@ -2772,10 +2780,12 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_r BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_mask_round, "__builtin_ia32_cvtsd2ss_mask_round", IX86_BUILTIN_CVTSD2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse2_cvtsi2sdq_round, "__builtin_ia32_cvtsi2sd64", IX86_BUILTIN_CVTSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT64_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse_cvtsi2ss_round, "__builtin_ia32_cvtsi2ss32", IX86_BUILTIN_CVTSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT_INT) BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse_cvtsi2ssq_round, "__builtin_ia32_cvtsi2ss64", IX86_BUILTIN_CVTSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT64_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtss2sd_round, "__builtin_ia32_cvtss2sd_round", IX86_BUILTIN_CVTSS2SD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtss2sd_mask_round, "__builtin_ia32_cvtss2sd_mask_round", IX86_BUILTIN_CVTSS2SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2dq512_mask", IX86_BUILTIN_CVTTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fixuns_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2udq512_mask", IX86_BUILTIN_CVTTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2dq512_mask", IX86_BUILTIN_CVTTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT) @@ -2911,13 +2921,21 @@ BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf_mask_round, "__b BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rcp28v8df_mask_round, "__builtin_ia32_rcp28pd_mask", IX86_BUILTIN_RCP28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rcp28v16sf_mask_round, "__builtin_ia32_rcp28ps_mask", IX86_BUILTIN_RCP28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v2df_round, "__builtin_ia32_rcp28sd_round", IX86_BUILTIN_RCP28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v2df_mask_round, "__builtin_ia32_rcp28sd_mask_round", IX86_BUILTIN_RCP28SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v4sf_round, "__builtin_ia32_rcp28ss_round", IX86_BUILTIN_RCP28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v4sf_mask_round, "__builtin_ia32_rcp28ss_mask_round", IX86_BUILTIN_RCP28SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, 
CODE_FOR_avx512er_rsqrt28v8df_mask_round, "__builtin_ia32_rsqrt28pd_mask", IX86_BUILTIN_RSQRT28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rsqrt28v16sf_mask_round, "__builtin_ia32_rsqrt28ps_mask", IX86_BUILTIN_RSQRT28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v2df_round, "__builtin_ia32_rsqrt28sd_round", IX86_BUILTIN_RSQRT28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v2df_mask_round, "__builtin_ia32_rsqrt28sd_mask_round", IX86_BUILTIN_RSQRT28SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v4sf_round, "__builtin_ia32_rsqrt28ss_round", IX86_BUILTIN_RSQRT28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v4sf_mask_round, "__builtin_ia32_rsqrt28ss_mask_round", IX86_BUILTIN_RSQRT28SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT) /* AVX512DQ. */ +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv8df_mask_round, "__builtin_ia32_reducepd512_mask_round", IX86_BUILTIN_REDUCEPD512_MASK_ROUND, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv16sf_mask_round, "__builtin_ia32_reduceps512_mask_round", IX86_BUILTIN_REDUCEPS512_MASK_ROUND, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducesv2df_mask_round, "__builtin_ia32_reducesd_mask_round", IX86_BUILTIN_REDUCESD128_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducesv4sf_mask_round, "__builtin_ia32_reducess_mask_round", IX86_BUILTIN_REDUCESS128_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv2df_mask_round, "__builtin_ia32_rangesd128_mask_round", IX86_BUILTIN_RANGESD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv4sf_mask_round, "__builtin_ia32_rangess128_mask_round", IX86_BUILTIN_RANGESS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fix_notruncv8dfv8di2_mask_round, "__builtin_ia32_cvtpd2qq512_mask", IX86_BUILTIN_CVTPD2QQ512, UNKNOWN, (int) V8DI_FTYPE_V8DF_V8DI_QI_INT) diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index ca7a870..504987a 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -1194,6 +1194,11 @@ ix86_init_mmx_sse_builtins (void) def_builtin (0, OPTION_MASK_ISA2_WAITPKG, "__builtin_ia32_tpause", UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); + /* UINTR. */ + def_builtin (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_UINTR, + "__builtin_ia32_testui", + UINT8_FTYPE_VOID, IX86_BUILTIN_TESTUI); + /* CLDEMOTE. 
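The UINTR builtins registered above map one-to-one onto the new instructions: STUI and CLUI set and clear the user-interrupt flag, TESTUI reads it back through the carry flag, and SENDUIPI takes a 64-bit UITT index. A hedged sketch, not part of the commit, valid only for x86-64 with -muintr or the target attribute; the function name and index parameter are illustrative.

__attribute__ ((target ("uintr")))
void
poke_peer (unsigned long long uitt_index)
{
  __builtin_ia32_stui ();                 /* allow user interrupts (UIF = 1) */
  __builtin_ia32_senduipi (uitt_index);   /* send a user IPI via that UITT entry */
  if (__builtin_ia32_testui ())           /* returns the current UIF value */
    __builtin_ia32_clui ();               /* mask user interrupts again */
}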
*/ def_builtin (0, OPTION_MASK_ISA2_CLDEMOTE, "__builtin_ia32_cldemote", VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); diff --git a/gcc/config/i386/i386-builtins.h b/gcc/config/i386/i386-builtins.h index cc6a8ce..a88cc0c 100644 --- a/gcc/config/i386/i386-builtins.h +++ b/gcc/config/i386/i386-builtins.h @@ -40,6 +40,7 @@ enum ix86_builtins IX86_BUILTIN_UMONITOR, IX86_BUILTIN_UMWAIT, IX86_BUILTIN_TPAUSE, + IX86_BUILTIN_TESTUI, IX86_BUILTIN_CLZERO, IX86_BUILTIN_CLDEMOTE, IX86_BUILTIN_VEC_INIT_V2SI, diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 3553a37..bbe9ac5 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -588,6 +588,20 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__ENQCMD__"); if (isa_flag2 & OPTION_MASK_ISA2_TSXLDTRK) def_or_undef (parse_in, "__TSXLDTRK__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_TILE) + def_or_undef (parse_in, "__AMX_TILE__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_INT8) + def_or_undef (parse_in, "__AMX_INT8__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_BF16) + def_or_undef (parse_in, "__AMX_BF16__"); + if (isa_flag & OPTION_MASK_ISA_SAHF) + def_or_undef (parse_in, "__LAHF_SAHF__"); + if (isa_flag2 & OPTION_MASK_ISA2_MOVBE) + def_or_undef (parse_in, "__MOVBE__"); + if (isa_flag2 & OPTION_MASK_ISA2_UINTR) + def_or_undef (parse_in, "__UINTR__"); + if (isa_flag2 & OPTION_MASK_ISA2_HRESET) + def_or_undef (parse_in, "__HRESET__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e6f8b31..3e8afe6 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -1045,7 +1045,8 @@ ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, rtx src2 = operands[2]; /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) + if ((MEM_P (src1) || bcst_mem_operand (src1, mode)) + && (MEM_P (src2) || bcst_mem_operand (src2, mode))) return false; /* Canonicalize operand order for commutative operators. */ @@ -3525,6 +3526,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) machine_mode mode = GET_MODE (dest); machine_mode cmpmode = GET_MODE (cmp); + /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */ + if (rtx_equal_p (op_true, op_false)) + { + emit_move_insn (dest, op_true); + return; + } + /* In AVX512F the result of comparison is an integer mask. 
*/ bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode); @@ -10225,12 +10233,16 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: + case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: nargs = 5; break; case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: + case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT: + case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT: nargs_constant = 4; nargs = 5; break; @@ -10413,6 +10425,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case USHORT_FTYPE_VOID: case UINT64_FTYPE_VOID: case UINT_FTYPE_VOID: + case UINT8_FTYPE_VOID: case UNSIGNED_FTYPE_VOID: nargs = 0; klass = load; @@ -11203,6 +11216,19 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, return target; + case IX86_BUILTIN_TESTUI: + emit_insn (gen_testui ()); + + if (target == 0 + || !register_operand (target, QImode)) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + return target; + case IX86_BUILTIN_CLZERO: arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); @@ -12807,6 +12833,14 @@ rdseed_step: emit_insn (gen_incssp (mode, op0)); return 0; + case IX86_BUILTIN_HRESET: + icode = CODE_FOR_hreset; + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + op0 = force_reg (SImode, op0); + emit_insn (gen_hreset (op0)); + return 0; + case IX86_BUILTIN_RSTORSSP: case IX86_BUILTIN_CLRSSBSY: arg0 = CALL_EXPR_ARG (exp, 0); diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 2fabd20..82c8091 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -209,7 +209,12 @@ static struct ix86_target_opts isa2_opts[] = { "-mavx512bf16", OPTION_MASK_ISA2_AVX512BF16 }, { "-menqcmd", OPTION_MASK_ISA2_ENQCMD }, { "-mserialize", OPTION_MASK_ISA2_SERIALIZE }, - { "-mtsxldtrk", OPTION_MASK_ISA2_TSXLDTRK } + { "-mtsxldtrk", OPTION_MASK_ISA2_TSXLDTRK }, + { "-mamx-tile", OPTION_MASK_ISA2_AMX_TILE }, + { "-mamx-int8", OPTION_MASK_ISA2_AMX_INT8 }, + { "-mamx-bf16", OPTION_MASK_ISA2_AMX_BF16 }, + { "-muintr", OPTION_MASK_ISA2_UINTR }, + { "-mhreset", OPTION_MASK_ISA2_HRESET } }; static struct ix86_target_opts isa_opts[] = { @@ -1028,11 +1033,16 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), IX86_ATTR_ISA ("cldemote", OPT_mcldemote), + IX86_ATTR_ISA ("uintr", OPT_muintr), IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), IX86_ATTR_ISA ("avx512bf16", OPT_mavx512bf16), IX86_ATTR_ISA ("enqcmd", OPT_menqcmd), IX86_ATTR_ISA ("serialize", OPT_mserialize), IX86_ATTR_ISA ("tsxldtrk", OPT_mtsxldtrk), + IX86_ATTR_ISA ("amx-tile", OPT_mamx_tile), + IX86_ATTR_ISA ("amx-int8", OPT_mamx_int8), + IX86_ATTR_ISA ("amx-bf16", OPT_mamx_bf16), + IX86_ATTR_ISA ("hreset", OPT_mhreset), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -1893,6 +1903,9 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_stringop_alg = no_stringop; } + if (TARGET_UINTR && !TARGET_64BIT) + error ("%<-muintr%> not supported for 32-bit code"); + if (!opts->x_ix86_arch_string) opts->x_ix86_arch_string = TARGET_64BIT_P (opts->x_ix86_isa_flags) @@ -2052,10 +2065,27 @@ 
ix86_option_override_internal (bool main_args_p, return false; } + /* The feature-only micro-architecture levels that use + PTA_NO_TUNE are only defined for the x86-64 psABI. */ + if ((processor_alias_table[i].flags & PTA_NO_TUNE) != 0 + && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || opts->x_ix86_abi != SYSV_ABI)) + { + error (G_("%<%s%> architecture level is only defined" + " for the x86-64 psABI"), opts->x_ix86_arch_string); + return false; + } + ix86_schedule = processor_alias_table[i].schedule; ix86_arch = processor_alias_table[i].processor; - /* Default cpu tuning to the architecture. */ - ix86_tune = ix86_arch; + + /* Default cpu tuning to the architecture, unless the table + entry requests not to do this. Used by the x86-64 psABI + micro-architecture levels. */ + if ((processor_alias_table[i].flags & PTA_NO_TUNE) == 0) + ix86_tune = ix86_arch; + else + ix86_tune = PROCESSOR_GENERIC; if (((processor_alias_table[i].flags & PTA_MMX) != 0) && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) @@ -2258,6 +2288,18 @@ ix86_option_override_internal (bool main_args_p, && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_AVX512BF16)) opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BF16; + if (((processor_alias_table[i].flags & PTA_AMX_TILE) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_TILE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_TILE; + if (((processor_alias_table[i].flags & PTA_AMX_INT8) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_INT8)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_INT8; + if (((processor_alias_table[i].flags & PTA_AMX_BF16) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_BF16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_BF16; if (((processor_alias_table[i].flags & PTA_MOVDIRI) != 0) && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVDIRI)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVDIRI; @@ -2366,7 +2408,8 @@ ix86_option_override_internal (bool main_args_p, ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name) + && (processor_alias_table[i].flags & PTA_NO_TUNE) == 0) { ix86_schedule = processor_alias_table[i].schedule; ix86_tune = processor_alias_table[i].processor; @@ -2410,8 +2453,9 @@ ix86_option_override_internal (bool main_args_p, auto_vec <const char *> candidates; for (i = 0; i < pta_size; i++) - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + if ((!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + && (processor_alias_table[i].flags & PTA_NO_TUNE) == 0) candidates.safe_push (processor_alias_table[i].name); #ifdef HAVE_LOCAL_CPU_DETECT diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index c890a73..502d240 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1484,7 +1484,7 @@ ix86_reg_parm_stack_space (const_tree fndecl) bool ix86_libc_has_function (enum function_class fn_class) { - return targetm.libc_has_function (fn_class); + return targetm.libc_has_function (fn_class, NULL_TREE); } /* Returns value SYSV_ABI, MS_ABI dependent on fntype, @@ -13098,6 +13098,43 @@ ix86_print_operand (FILE *file, rtx x, int code) fputs (dstr, file); } + /* Print bcst_mem_operand. 
*/ + else if (GET_CODE (x) == VEC_DUPLICATE) + { + machine_mode vmode = GET_MODE (x); + /* Must be bcst_memory_operand. */ + gcc_assert (bcst_mem_operand (x, vmode)); + + rtx mem = XEXP (x,0); + ix86_print_operand (file, mem, 0); + + switch (vmode) + { + case E_V2DImode: + case E_V2DFmode: + fputs ("{1to2}", file); + break; + case E_V4SImode: + case E_V4SFmode: + case E_V4DImode: + case E_V4DFmode: + fputs ("{1to4}", file); + break; + case E_V8SImode: + case E_V8SFmode: + case E_V8DFmode: + case E_V8DImode: + fputs ("{1to8}", file); + break; + case E_V16SFmode: + case E_V16SImode: + fputs ("{1to16}", file); + break; + default: + gcc_unreachable (); + } + } + else { /* We have patterns that allow zero sets of memory, for instance. @@ -15131,11 +15168,32 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) /* Codes needing carry flag. */ case GEU: /* CF=0 */ case LTU: /* CF=1 */ + rtx geu; /* Detect overflow checks. They need just the carry flag. */ if (GET_CODE (op0) == PLUS && (rtx_equal_p (op1, XEXP (op0, 0)) || rtx_equal_p (op1, XEXP (op0, 1)))) return CCCmode; + /* Similarly for *setcc_qi_addqi3_cconly_overflow_1_* patterns. + Match LTU of op0 + (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))) + and op1 + (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0)) + where CC_CCC is either CC or CCC. */ + else if (code == LTU + && GET_CODE (op0) == NEG + && GET_CODE (geu = XEXP (op0, 0)) == GEU + && REG_P (XEXP (geu, 0)) + && (GET_MODE (XEXP (geu, 0)) == CCCmode + || GET_MODE (XEXP (geu, 0)) == CCmode) + && REGNO (XEXP (geu, 0)) == FLAGS_REG + && XEXP (geu, 1) == const0_rtx + && GET_CODE (op1) == LTU + && REG_P (XEXP (op1, 0)) + && GET_MODE (XEXP (op1, 0)) == GET_MODE (XEXP (geu, 0)) + && REGNO (XEXP (op1, 0)) == FLAGS_REG + && XEXP (op1, 1) == const0_rtx) + return CCCmode; else return CCmode; case GTU: /* CF=0 & ZF=0 */ @@ -19749,33 +19807,56 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return false; case COMPARE: - if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT - && XEXP (XEXP (x, 0), 1) == const1_rtx - && CONST_INT_P (XEXP (XEXP (x, 0), 2)) - && XEXP (x, 1) == const0_rtx) + rtx op0, op1; + op0 = XEXP (x, 0); + op1 = XEXP (x, 1); + if (GET_CODE (op0) == ZERO_EXTRACT + && XEXP (op0, 1) == const1_rtx + && CONST_INT_P (XEXP (op0, 2)) + && op1 == const0_rtx) { /* This kind of construct is implemented using test[bwl]. Treat it as if we had an AND. */ - mode = GET_MODE (XEXP (XEXP (x, 0), 0)); + mode = GET_MODE (XEXP (op0, 0)); *total = (cost->add - + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, + + rtx_cost (XEXP (op0, 0), mode, outer_code, opno, speed) + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); return true; } - if (GET_CODE (XEXP (x, 0)) == PLUS - && rtx_equal_p (XEXP (XEXP (x, 0), 0), XEXP (x, 1))) + if (GET_CODE (op0) == PLUS && rtx_equal_p (XEXP (op0, 0), op1)) { /* This is an overflow detection, count it as a normal compare. 
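The carry-flag cases above (CCCmode for GEU/LTU and the new zero-cost COMPARE match) target the usual add-with-carry idiom. A small C sketch of the kind of source that produces it, not part of the commit; names are illustrative.

/* Double-word addition: the (l < a_lo) carry feeds the high half, which is
   what the setcc + add-overflow patterns recognize.  */
unsigned long long
add_128_hi (unsigned long long a_lo, unsigned long long a_hi,
            unsigned long long b_lo, unsigned long long b_hi,
            unsigned long long *lo)
{
  unsigned long long l = a_lo + b_lo;
  unsigned long long h = a_hi + b_hi + (l < a_lo);
  *lo = l;
  return h;
}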
*/ - *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)), - COMPARE, 0, speed); + *total = rtx_cost (op0, GET_MODE (op0), COMPARE, 0, speed); + return true; + } + + rtx geu; + /* Match x + (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))) + (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))) */ + if (mode == CCCmode + && GET_CODE (op0) == NEG + && GET_CODE (geu = XEXP (op0, 0)) == GEU + && REG_P (XEXP (geu, 0)) + && (GET_MODE (XEXP (geu, 0)) == CCCmode + || GET_MODE (XEXP (geu, 0)) == CCmode) + && REGNO (XEXP (geu, 0)) == FLAGS_REG + && XEXP (geu, 1) == const0_rtx + && GET_CODE (op1) == LTU + && REG_P (XEXP (op1, 0)) + && GET_MODE (XEXP (op1, 0)) == GET_MODE (XEXP (geu, 0)) + && REGNO (XEXP (op1, 0)) == FLAGS_REG + && XEXP (op1, 1) == const0_rtx) + { + /* This is *setcc_qi_addqi3_cconly_overflow_1_* patterns, a nop. */ + *total = 0; return true; } /* The embedded comparison operand is completely free. */ - if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) - && XEXP (x, 1) == const0_rtx) + if (!general_operand (op0, GET_MODE (op0)) && op1 == const0_rtx) *total = 0; return false; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 92b7475..24207d0 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -203,6 +203,16 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_SERIALIZE_P(x) TARGET_ISA2_SERIALIZE_P(x) #define TARGET_TSXLDTRK TARGET_ISA2_TSXLDTRK #define TARGET_TSXLDTRK_P(x) TARGET_ISA2_TSXLDTRK_P(x) +#define TARGET_AMX_TILE TARGET_ISA2_AMX_TILE +#define TARGET_AMX_TILE_P(x) TARGET_ISA2_AMX_TILE(x) +#define TARGET_AMX_INT8 TARGET_ISA2_AMX_INT8 +#define TARGET_AMX_INT8_P(x) TARGET_ISA2_AMX_INT8(x) +#define TARGET_AMX_BF16 TARGET_ISA2_AMX_BF16 +#define TARGET_AMX_BF16_P(x) TARGET_ISA2_AMX_BF16(x) +#define TARGET_UINTR TARGET_ISA2_UINTR +#define TARGET_UINTR_P(x) TARGET_ISA2_UINTR_P(x) +#define TARGET_HRESET TARGET_ISA2_HRESET +#define TARGET_HRESET_P(x) TARGET_ISA2_HRESET_P(x) #define TARGET_LP64 TARGET_ABI_64 #define TARGET_LP64_P(x) TARGET_ABI_64_P(x) @@ -1262,6 +1272,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \ || (MODE) == V8SFmode || (MODE) == V4DFmode)) +#define VALID_BCST_MODE_P(MODE) \ + ((MODE) == SFmode || (MODE) == DFmode \ + || (MODE) == SImode || (MODE) == DImode) + /* It is possible to write patterns to move flags; but until someone does it, */ #define AVOID_CCMODE_COPIES @@ -2427,7 +2441,7 @@ const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40); const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41); const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42); const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43); -/* Hole after PTA_MPX was removed. 
*/ +const wide_int_bitmask PTA_NO_TUNE (HOST_WIDE_INT_1U << 44); const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45); const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46); const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47); @@ -2466,6 +2480,21 @@ const wide_int_bitmask PTA_ENQCMD (0, HOST_WIDE_INT_1U << 15); const wide_int_bitmask PTA_CLDEMOTE (0, HOST_WIDE_INT_1U << 16); const wide_int_bitmask PTA_SERIALIZE (0, HOST_WIDE_INT_1U << 17); const wide_int_bitmask PTA_TSXLDTRK (0, HOST_WIDE_INT_1U << 18); +const wide_int_bitmask PTA_AMX_TILE(0, HOST_WIDE_INT_1U << 19); +const wide_int_bitmask PTA_AMX_INT8(0, HOST_WIDE_INT_1U << 20); +const wide_int_bitmask PTA_AMX_BF16(0, HOST_WIDE_INT_1U << 21); +const wide_int_bitmask PTA_UINTR (0, HOST_WIDE_INT_1U << 22); +const wide_int_bitmask PTA_HRESET(0, HOST_WIDE_INT_1U << 23); + +const wide_int_bitmask PTA_X86_64_BASELINE = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR; +const wide_int_bitmask PTA_X86_64_V2 = (PTA_X86_64_BASELINE & (~PTA_NO_SAHF)) + | PTA_CX16 | PTA_POPCNT | PTA_SSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_SSSE3; +const wide_int_bitmask PTA_X86_64_V3 = PTA_X86_64_V2 + | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_LZCNT + | PTA_MOVBE | PTA_XSAVE; +const wide_int_bitmask PTA_X86_64_V4 = PTA_X86_64_V3 + | PTA_AVX512F | PTA_AVX512BW | PTA_AVX512CD | PTA_AVX512DQ | PTA_AVX512VL; const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR; @@ -2499,9 +2528,10 @@ const wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT; const wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_COOPERLAKE | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE - | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK; + | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE + | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR; const wide_int_bitmask PTA_ALDERLAKE = PTA_SKYLAKE | PTA_CLDEMOTE | PTA_PTWRITE - | PTA_WAITPKG | PTA_SERIALIZE; + | PTA_WAITPKG | PTA_SERIALIZE | PTA_HRESET; const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD; const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 93aae81..8730816 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -186,6 +186,10 @@ ;; IRET support UNSPEC_INTERRUPT_RETURN + + ;; For MOVDIRI and MOVDIR64B support + UNSPEC_MOVDIRI + UNSPEC_MOVDIR64B ]) (define_c_enum "unspecv" [ @@ -280,10 +284,6 @@ UNSPECV_SETSSBSY UNSPECV_CLRSSBSY - ;; For MOVDIRI and MOVDIR64B support - UNSPECV_MOVDIRI - UNSPECV_MOVDIR64B - ;; For TSXLDTRK support UNSPECV_XSUSLDTRK UNSPECV_XRESLDTRK @@ -293,6 +293,12 @@ UNSPECV_UMONITOR UNSPECV_TPAUSE + ;; For UINTR support + UNSPECV_CLUI + UNSPECV_STUI + UNSPECV_TESTUI + UNSPECV_SENDUIPI + ;; For CLDEMOTE support UNSPECV_CLDEMOTE @@ -310,6 +316,9 @@ ;; For patchable area support UNSPECV_PATCHABLE_AREA + + ;; For HRESET support + UNSPECV_HRESET ]) ;; Constants to represent rounding modes in the ROUND instruction @@ -7039,6 +7048,20 @@ (set (match_operand:SWI48 0 "register_operand") (minus:SWI48 (match_dup 1) (match_dup 2)))])] "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)") + +(define_mode_iterator CC_CCC [CC CCC]) + +;; Pre-reload splitter to optimize +;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI +;; 
operand and no intervening flags modifications into nothing. +(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1_<mode>" + [(set (reg:CCC FLAGS_REG) + (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))) + (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)]) ;; Overflow setting add instructions @@ -13666,7 +13689,7 @@ (unspec [(const_int 0)] UNSPEC_INTERRUPT_RETURN)] "reload_completed" { - return TARGET_64BIT ? "iretq" : "iret"; + return TARGET_64BIT ? (TARGET_UINTR ? "uiret" : "iretq") : "iret"; }) ;; Used by x86_machine_dependent_reorg to avoid penalty on single byte RET @@ -21531,17 +21554,17 @@ ;; MOVDIRI and MOVDIR64B (define_insn "movdiri<mode>" - [(unspec_volatile:SWI48 [(match_operand:SWI48 0 "memory_operand" "m") - (match_operand:SWI48 1 "register_operand" "r")] - UNSPECV_MOVDIRI)] + [(set (match_operand:SWI48 0 "memory_operand" "=m") + (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")] + UNSPEC_MOVDIRI))] "TARGET_MOVDIRI" "movdiri\t{%1, %0|%0, %1}" [(set_attr "type" "other")]) (define_insn "@movdir64b_<mode>" - [(unspec_volatile:XI [(match_operand:P 0 "register_operand" "r") - (match_operand:XI 1 "memory_operand")] - UNSPECV_MOVDIR64B)] + [(set (mem:XI (match_operand:P 0 "register_operand" "r")) + (unspec:XI [(match_operand:XI 1 "memory_operand" "m")] + UNSPEC_MOVDIR64B))] "TARGET_MOVDIR64B" "movdir64b\t{%1, %0|%0, %1}" [(set_attr "type" "other")]) @@ -21571,6 +21594,34 @@ "enqcmd<enqcmd_sfx>\t{%1, %0|%0, %1}" [(set_attr "type" "other")]) +;; UINTR +(define_int_iterator UINTR [UNSPECV_CLUI UNSPECV_STUI]) +(define_int_attr uintr [(UNSPECV_CLUI "clui") (UNSPECV_STUI "stui")]) + +(define_insn "<uintr>" + [(unspec_volatile [(const_int 0)] UINTR)] + "TARGET_UINTR && TARGET_64BIT" + "<uintr>" + [(set_attr "type" "other") + (set_attr "length" "4")]) + +(define_insn "testui" + [(set (reg:CCC FLAGS_REG) + (unspec_volatile:CCC [(const_int 0)] UNSPECV_TESTUI))] + "TARGET_UINTR && TARGET_64BIT" + "testui" + [(set_attr "type" "other") + (set_attr "length" "4")]) + +(define_insn "senduipi" + [(unspec_volatile + [(match_operand:DI 0 "register_operand" "r")] + UNSPECV_SENDUIPI)] + "TARGET_UINTR && TARGET_64BIT" + "senduipi\t%0" + [(set_attr "type" "other") + (set_attr "length" "4")]) + ;; WAITPKG (define_insn "umwait" @@ -21655,6 +21706,14 @@ (set_attr "length_immediate" "0") (set_attr "modrm" "0")]) +(define_insn "hreset" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "a")] + UNSPECV_HRESET)] + "TARGET_HRESET" + "hreset\t{$0|0}" + [(set_attr "type" "other") + (set_attr "length" "4")]) + (include "mmx.md") (include "sse.md") (include "sync.md") diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c9f7195..e6b1695 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -788,6 +788,10 @@ mptwrite Target Report Mask(ISA2_PTWRITE) Var(ix86_isa_flags2) Save Support PTWRITE built-in functions and code generation. +muintr +Target Report Mask(ISA2_UINTR) Var(ix86_isa_flags2) Save +Support UINTR built-in functions and code generation. + msgx Target Report Mask(ISA2_SGX) Var(ix86_isa_flags2) Save Support SGX built-in functions and code generation. @@ -1114,4 +1118,20 @@ Support SERIALIZE built-in functions and code generation. mtsxldtrk Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save -Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file +Support TSXLDTRK built-in functions and code generation. + +mamx-tile +Target Report Mask(ISA2_AMX_TILE) Var(ix86_isa_flags2) Save +Support AMX-TILE built-in functions and code generation. + +mamx-int8 +Target Report Mask(ISA2_AMX_INT8) Var(ix86_isa_flags2) Save +Support AMX-INT8 built-in functions and code generation. + +mamx-bf16 +Target Report Mask(ISA2_AMX_BF16) Var(ix86_isa_flags2) Save +Support AMX-BF16 built-in functions and code generation. + +mhreset +Target Report Mask(ISA2_HRESET) Var(ix86_isa_flags2) Save +Support HRESET built-in functions and code generation. diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index fd29797..3568d1f 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <ia32intrin.h> directly; include <x86gprintrin.h> instead." #endif /* 32bit bsf */ diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index b660d0d..71eae83 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -24,6 +24,8 @@ #ifndef _IMMINTRIN_H_INCLUDED #define _IMMINTRIN_H_INCLUDED +#include <x86gprintrin.h> + #include <mmintrin.h> #include <xmmintrin.h> @@ -38,16 +40,6 @@ #include <wmmintrin.h> -#include <fxsrintrin.h> - -#include <xsaveintrin.h> - -#include <xsaveoptintrin.h> - -#include <xsavesintrin.h> - -#include <xsavecintrin.h> - #include <avxintrin.h> #include <avx2intrin.h> @@ -102,217 +94,28 @@ #include <shaintrin.h> -#include <lzcntintrin.h> - -#include <bmiintrin.h> - -#include <bmi2intrin.h> - #include <fmaintrin.h> #include <f16cintrin.h> #include <rtmintrin.h> -#include <xtestintrin.h> - -#include <cetintrin.h> - #include <gfniintrin.h> #include <vaesintrin.h> #include <vpclmulqdqintrin.h> -#include <movdirintrin.h> - -#include <sgxintrin.h> - -#include <pconfigintrin.h> - -#include <waitpkgintrin.h> - -#include <cldemoteintrin.h> - #include <avx512bf16vlintrin.h> #include <avx512bf16intrin.h> -#include <enqcmdintrin.h> +#include <amxtileintrin.h> -#include <serializeintrin.h> +#include <amxint8intrin.h> -#include <tsxldtrkintrin.h> - -#include <rdseedintrin.h> +#include <amxbf16intrin.h> #include <prfchwintrin.h> -#include <adxintrin.h> - -#include <clwbintrin.h> - -#include <clflushoptintrin.h> - -#include <wbnoinvdintrin.h> - -#include <pkuintrin.h> - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_wbinvd (void) -{ - __builtin_ia32_wbinvd (); -} - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand16_step (unsigned short *__P) -{ - return __builtin_ia32_rdrand16_step (__P); -} - -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand32_step (unsigned int *__P) -{ - return __builtin_ia32_rdrand32_step (__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#ifndef __RDPID__ -#pragma GCC push_options -#pragma GCC target("rdpid") -#define __DISABLE_RDPID__ -#endif /* __RDPID__ */ -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) -_rdpid_u32 (void) -{ - return __builtin_ia32_rdpid (); -} -#ifdef __DISABLE_RDPID__ -#undef __DISABLE_RDPID__ -#pragma GCC pop_options -#endif /* __DISABLE_RDPID__ */ - -#ifdef __x86_64__ - -#ifndef __FSGSBASE__ -#pragma GCC push_options -#pragma GCC target("fsgsbase") -#define __DISABLE_FSGSBASE__ -#endif /* __FSGSBASE__ */ -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readfsbase_u32 (void) -{ - return __builtin_ia32_rdfsbase32 (); -} - -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readfsbase_u64 (void) -{ - return __builtin_ia32_rdfsbase64 (); -} - -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readgsbase_u32 (void) -{ - return __builtin_ia32_rdgsbase32 (); -} - -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readgsbase_u64 (void) -{ - return __builtin_ia32_rdgsbase64 (); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writefsbase_u32 (unsigned int __B) -{ - __builtin_ia32_wrfsbase32 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writefsbase_u64 (unsigned long long __B) -{ - __builtin_ia32_wrfsbase64 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writegsbase_u32 (unsigned int __B) -{ - __builtin_ia32_wrgsbase32 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writegsbase_u64 (unsigned long long __B) -{ - __builtin_ia32_wrgsbase64 (__B); -} -#ifdef __DISABLE_FSGSBASE__ -#undef __DISABLE_FSGSBASE__ -#pragma GCC pop_options -#endif /* __DISABLE_FSGSBASE__ */ - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand64_step (unsigned long long *__P) -{ - return __builtin_ia32_rdrand64_step (__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#endif /* __x86_64__ */ - -#ifndef __PTWRITE__ -#pragma GCC push_options -#pragma GCC target("ptwrite") -#define __DISABLE_PTWRITE__ -#endif - -#ifdef __x86_64__ -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_ptwrite64 (unsigned long long __B) -{ - __builtin_ia32_ptwrite64 (__B); -} -#endif /* __x86_64__ */ - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_ptwrite32 (unsigned __B) -{ - __builtin_ia32_ptwrite32 (__B); -} -#ifdef __DISABLE_PTWRITE__ -#undef __DISABLE_PTWRITE__ -#pragma GCC pop_options -#endif /* __DISABLE_PTWRITE__ */ - #endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/intelmic-mkoffload.c b/gcc/config/i386/intelmic-mkoffload.c index 15b5c3d..668208c 100644 --- a/gcc/config/i386/intelmic-mkoffload.c +++ b/gcc/config/i386/intelmic-mkoffload.c @@ -231,7 +231,7 @@ compile_for_target (struct obstack *argv_obstack) unsetenv ("LIBRARY_PATH"); unsetenv ("LD_RUN_PATH"); - fork_execute (argv[0], argv, false); + fork_execute (argv[0], argv, false, NULL); obstack_free (argv_obstack, NULL); /* Restore environment variables. 
*/ @@ -455,7 +455,7 @@ generate_host_descr_file (const char *host_compiler) obstack_ptr_grow (&argv_obstack, NULL); char **argv = XOBFINISH (&argv_obstack, char **); - fork_execute (argv[0], argv, false); + fork_execute (argv[0], argv, false, NULL); obstack_free (&argv_obstack, NULL); return obj_filename; @@ -538,7 +538,7 @@ prepare_target_image (const char *target_compiler, int argc, char **argv) obstack_ptr_grow (&argv_obstack, rename_section_opt); obstack_ptr_grow (&argv_obstack, NULL); char **new_argv = XOBFINISH (&argv_obstack, char **); - fork_execute (new_argv[0], new_argv, false); + fork_execute (new_argv[0], new_argv, false, NULL); obstack_free (&argv_obstack, NULL); /* Objcopy has created symbols, containing the input file name with @@ -580,7 +580,7 @@ prepare_target_image (const char *target_compiler, int argc, char **argv) obstack_ptr_grow (&argv_obstack, opt_for_objcopy[2]); obstack_ptr_grow (&argv_obstack, NULL); new_argv = XOBFINISH (&argv_obstack, char **); - fork_execute (new_argv[0], new_argv, false); + fork_execute (new_argv[0], new_argv, false, NULL); obstack_free (&argv_obstack, NULL); return target_so_filename; @@ -672,7 +672,7 @@ main (int argc, char **argv) obstack_ptr_grow (&argv_obstack, out_obj_filename); obstack_ptr_grow (&argv_obstack, NULL); char **new_argv = XOBFINISH (&argv_obstack, char **); - fork_execute (new_argv[0], new_argv, false); + fork_execute (new_argv[0], new_argv, false, NULL); obstack_free (&argv_obstack, NULL); /* Run objcopy on the resultant object file to localize generated symbols @@ -688,7 +688,7 @@ main (int argc, char **argv) obstack_ptr_grow (&argv_obstack, out_obj_filename); obstack_ptr_grow (&argv_obstack, NULL); new_argv = XOBFINISH (&argv_obstack, char **); - fork_execute (new_argv[0], new_argv, false); + fork_execute (new_argv[0], new_argv, false, NULL); obstack_free (&argv_obstack, NULL); return 0; diff --git a/gcc/config/i386/lwpintrin.h b/gcc/config/i386/lwpintrin.h index d7c3acb..0b5c8bb 100644 --- a/gcc/config/i386/lwpintrin.h +++ b/gcc/config/i386/lwpintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <lwpintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _LWPINTRIN_H_INCLUDED diff --git a/gcc/config/i386/lzcntintrin.h b/gcc/config/i386/lzcntintrin.h index 1863a58..6d00e9f 100644 --- a/gcc/config/i386/lzcntintrin.h +++ b/gcc/config/i386/lzcntintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <lzcntintrin.h> directly; include <x86gprintrin.h> instead." #endif diff --git a/gcc/config/i386/mingw-w64.h b/gcc/config/i386/mingw-w64.h index 408e57c..0d0aa93 100644 --- a/gcc/config/i386/mingw-w64.h +++ b/gcc/config/i386/mingw-w64.h @@ -98,3 +98,9 @@ along with GCC; see the file COPYING3. If not see %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \ " LINK_SPEC_LARGE_ADDR_AWARE "\ %(shared_libgcc_undefs)" + +/* Enable sincos optimization, overriding cygming.h. sincos, sincosf + and sincosl functions are available on mingw-w64, but not on the + original mingw32. 
*/ +#undef TARGET_LIBC_HAS_FUNCTION +#define TARGET_LIBC_HAS_FUNCTION gnu_libc_has_function diff --git a/gcc/config/i386/mmintrin.h b/gcc/config/i386/mmintrin.h index 77de7ca..dff42fd 100644 --- a/gcc/config/i386/mmintrin.h +++ b/gcc/config/i386/mmintrin.h @@ -42,9 +42,15 @@ /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); +typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__)); +typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__)); /* Unaligned version of the same type */ typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1))); +typedef int __m32_u __attribute__ ((__vector_size__ (4), \ + __may_alias__, __aligned__ (1))); +typedef short __m16_u __attribute__ ((__vector_size__ (2), \ + __may_alias__, __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ typedef int __v2si __attribute__ ((__vector_size__ (8))); diff --git a/gcc/config/i386/movdirintrin.h b/gcc/config/i386/movdirintrin.h index e7f374a..b2f8406 100644 --- a/gcc/config/i386/movdirintrin.h +++ b/gcc/config/i386/movdirintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <movdirintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _MOVDIRINTRIN_H_INCLUDED diff --git a/gcc/config/i386/pconfigintrin.h b/gcc/config/i386/pconfigintrin.h index d2a3261..31c493a 100644 --- a/gcc/config/i386/pconfigintrin.h +++ b/gcc/config/i386/pconfigintrin.h @@ -1,5 +1,28 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <pconfigintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <pconfigintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _PCONFIGINTRIN_H_INCLUDED diff --git a/gcc/config/i386/pkuintrin.h b/gcc/config/i386/pkuintrin.h index 6840914..0d2dd51 100644 --- a/gcc/config/i386/pkuintrin.h +++ b/gcc/config/i386/pkuintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <pkuintrin.h> directly; include <immintrin.h> instead." 
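The header-guard changes in these intrinsic headers (pkuintrin.h, rdseedintrin.h, movdirintrin.h and the rest) reflect the new <x86gprintrin.h> umbrella header added later in this patch: the general-purpose-register intrinsic headers now refuse direct inclusion and expect <x86gprintrin.h>, which <immintrin.h> and <x86intrin.h> in turn include. A minimal sketch of user code that keeps compiling across this reorganisation (the function name is an illustration; RDRAND hardware is assumed at run time):

#include <x86gprintrin.h>  /* or <immintrin.h>/<x86intrin.h>, which now include it */

/* Retry RDRAND until the hardware reports success; _rdrand32_step is
   declared in x86gprintrin.h after this change, under its own
   target("rdrnd") pragma, so no extra -m flag is needed to compile it.  */
unsigned int
random_u32 (void)
{
  unsigned int r;
  while (!_rdrand32_step (&r))
    ;
  return r;
}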
+#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <pkuintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _PKUINTRIN_H_INCLUDED diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b03f9cd..be57cda 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1081,6 +1081,19 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "vector_memory_operand"))) +(define_predicate "bcst_mem_operand" + (and (match_code "vec_duplicate") + (and (match_test "TARGET_AVX512F") + (ior (match_test "TARGET_AVX512VL") + (match_test "GET_MODE_SIZE (GET_MODE (op)) == 64"))) + (match_test "VALID_BCST_MODE_P (GET_MODE_INNER (GET_MODE (op)))") + (match_test "memory_operand (XEXP (op, 0), GET_MODE (XEXP (op, 0)))"))) + +; Return true when OP is bcst_mem_operand or vector_memory_operand. +(define_predicate "bcst_vector_operand" + (ior (match_operand 0 "vector_operand") + (match_operand 0 "bcst_mem_operand"))) + ;; Return true when OP is either nonimmediate operand, or any ;; CONST_VECTOR. (define_predicate "nonimmediate_or_const_vector_operand" diff --git a/gcc/config/i386/rdseedintrin.h b/gcc/config/i386/rdseedintrin.h index efc7cea..168053a 100644 --- a/gcc/config/i386/rdseedintrin.h +++ b/gcc/config/i386/rdseedintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <rdseedintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _RDSEEDINTRIN_H_INCLUDED diff --git a/gcc/config/i386/rtmintrin.h b/gcc/config/i386/rtmintrin.h index 463a989..436e517 100644 --- a/gcc/config/i386/rtmintrin.h +++ b/gcc/config/i386/rtmintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _IMMINTRIN_H_INCLUDED -# error "Never use <rtmintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <rtmintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _RTMINTRIN_H_INCLUDED diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index 0c35b9e..95f26d6 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <serializeintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <serializeintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _SERIALIZE_H_INCLUDED diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 934b60a..52635f6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1756,8 +1756,8 @@ (define_insn "*<plusminus_insn><mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (plusminus:VF - (match_operand:VF 1 "<round_nimm_predicate>" "<comm>0,v") - (match_operand:VF 2 "<round_nimm_predicate>" "xBm,<round_constraint>")))] + (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v") + (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ @@ -1765,35 +1765,7 @@ v<plusminus_mnemonic><ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") - (set_attr "prefix" "<mask_prefix3>") - (set_attr "mode" "<MODE>")]) - -(define_insn "*sub<mode>3<mask_name>_bcst" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (minus:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m"))))] - "TARGET_AVX512F - && ix86_binary_operator_ok (MINUS, <MODE>mode, operands) - && <mask_mode512bit_condition>" - "vsub<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<avx512bcst>}" - [(set_attr "prefix" "evex") - (set_attr "type" "sseadd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*add<mode>3<mask_name>_bcst" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (plus:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VF_AVX512 2 "register_operand" "v")))] - "TARGET_AVX512F - && ix86_binary_operator_ok (PLUS, <MODE>mode, operands) - && <mask_mode512bit_condition>" - "vadd<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" - [(set_attr "prefix" "evex") - (set_attr "type" "sseadd") + (set_attr "prefix" "<bcst_mask_prefix3>") (set_attr "mode" "<MODE>")]) ;; Standard scalar operation patterns which preserve the rest of the @@ -1846,32 +1818,19 @@ (define_insn "*mul<mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (mult:VF - (match_operand:VF 1 "<round_nimm_predicate>" "%0,v") - (match_operand:VF 2 "<round_nimm_predicate>" "xBm,<round_constraint>")))] - "TARGET_SSE - && !(MEM_P (operands[1]) && MEM_P (operands[2])) + (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v") + (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] + "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ mul<ssemodesuffix>\t{%2, %0|%0, %2} vmul<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "ssemul") - (set_attr "prefix" "<mask_prefix3>") + (set_attr "prefix" "<bcst_mask_prefix3>") (set_attr "btver2_decode" "direct,double") (set_attr "mode" "<MODE>")]) -(define_insn "*mul<mode>3<mask_name>_bcst" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (mult:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VF_AVX512 2 "register_operand" "v")))] - "TARGET_AVX512F && 
<mask_mode512bit_condition>" - "vmul<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" - [(set_attr "prefix" "evex") - (set_attr "type" "ssemul") - (set_attr "mode" "<MODE>")]) - ;; Standard scalar operation patterns which preserve the rest of the ;; vector for combiner. (define_insn "*<sse>_vm<multdiv_mnemonic><mode>3" @@ -1943,26 +1902,14 @@ [(set (match_operand:VF 0 "register_operand" "=x,v") (div:VF (match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "<round_nimm_predicate>" "xBm,<round_constraint>")))] + (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ div<ssemodesuffix>\t{%2, %0|%0, %2} vdiv<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "ssediv") - (set_attr "prefix" "<mask_prefix3>") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<avx512>_div<mode>3<mask_name>_bcst" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (div:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m"))))] - "TARGET_AVX512F && <mask_mode512bit_condition>" - "vdiv<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<avx512bcst>}" - [(set_attr "prefix" "evex") - (set_attr "type" "ssediv") + (set_attr "prefix" "<bcst_mask_prefix3>") (set_attr "mode" "<MODE>")]) (define_insn "<sse>_rcp<mode>2" @@ -2861,30 +2808,30 @@ DONE; }) -(define_insn "<mask_codefor>reducep<mode><mask_name>" +(define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "vm") + [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_REDUCE))] "TARGET_AVX512DQ" - "vreduce<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + "vreduce<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "reduces<mode><mask_scalar_name>" +(define_insn "reduces<mode><mask_scalar_name><round_saeonly_scalar_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_REDUCE) (match_dup 1) (const_int 1)))] "TARGET_AVX512DQ" - "vreduce<ssescalarmodesuffix>\t{%3, %2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2, %3}" + "vreduce<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -4055,9 +4002,9 @@ (define_insn "<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name><round_name>" [(set (match_operand:VF_SF_AVX512VL 0 "register_operand" "=v,v,v") (fma:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 1 "<round_nimm_predicate>" "%0,0,v") - 
(match_operand:VF_SF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>") - (match_operand:VF_SF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>,0")))] + (match_operand:VF_SF_AVX512VL 1 "<bcst_round_nimm_predicate>" "%0,0,v") + (match_operand:VF_SF_AVX512VL 2 "<bcst_round_nimm_predicate>" "<bcst_round_constraint>,v,<bcst_round_constraint>") + (match_operand:VF_SF_AVX512VL 3 "<bcst_round_nimm_predicate>" "v,<bcst_round_constraint>,0")))] "TARGET_AVX512F && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ vfmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} @@ -4066,46 +4013,6 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "%0") - (match_operand:VF_AVX512 2 "register_operand" "v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "vfmadd213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name>_bcst_2" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m,m")) - (match_operand:VF_AVX512 2 "register_operand" "0,v") - (match_operand:VF_AVX512 3 "register_operand" "v,0")))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfmadd132<ssemodesuffix>\t{%1<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %1<avx512bcst>} - vfmadd231<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %1<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name>_bcst_3" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m,m")) - (match_operand:VF_AVX512 3 "register_operand" "v,0")))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfmadd132<ssemodesuffix>\t{%2<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<avx512bcst>} - vfmadd231<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - (define_insn "<avx512>_fmadd_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4171,10 +4078,10 @@ (define_insn "<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name><round_name>" [(set (match_operand:VF_SF_AVX512VL 0 "register_operand" "=v,v,v") (fma:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 1 "<round_nimm_predicate>" "%0,0,v") - (match_operand:VF_SF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>") + (match_operand:VF_SF_AVX512VL 1 "<bcst_round_nimm_predicate>" "%0,0,v") + (match_operand:VF_SF_AVX512VL 2 "<bcst_round_nimm_predicate>" "<bcst_round_constraint>,v,<bcst_round_constraint>") (neg:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>,0"))))] + (match_operand:VF_SF_AVX512VL 3 "<bcst_round_nimm_predicate>" 
"v,<bcst_round_constraint>,0"))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ vfmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} @@ -4183,49 +4090,6 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "%0") - (match_operand:VF_AVX512 2 "register_operand" "v") - (neg:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m")))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "vfmsub213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_2" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m,m")) - (match_operand:VF_AVX512 2 "register_operand" "0,v") - (neg:VF_AVX512 - (match_operand:VF_AVX512 3 "register_operand" "v,0"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfmsub132<ssemodesuffix>\t{%1<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %1<avx512bcst>} - vfmsub231<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %1<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_3" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m,m")) - (neg:VF_AVX512 - (match_operand:VF_AVX512 3 "nonimmediate_operand" "v,0"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfmsub132<ssemodesuffix>\t{%2<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<avx512bcst>} - vfmsub231<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - (define_insn "<avx512>_fmsub_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4294,9 +4158,9 @@ [(set (match_operand:VF_SF_AVX512VL 0 "register_operand" "=v,v,v") (fma:VF_SF_AVX512VL (neg:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 1 "<round_nimm_predicate>" "%0,0,v")) - (match_operand:VF_SF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>") - (match_operand:VF_SF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>,0")))] + (match_operand:VF_SF_AVX512VL 1 "<bcst_round_nimm_predicate>" "%0,0,v")) + (match_operand:VF_SF_AVX512VL 2 "<bcst_round_nimm_predicate>" "<bcst_round_constraint>,v,<bcst_round_constraint>") + (match_operand:VF_SF_AVX512VL 3 "<bcst_round_nimm_predicate>" "v,<bcst_round_constraint>,0")))] "TARGET_AVX512F && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ vfnmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} @@ -4305,49 +4169,6 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - 
(fma:VF_AVX512 - (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "%0")) - (match_operand:VF_AVX512 2 "register_operand" "v") - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "vfnmadd213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name>_bcst_2" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (neg:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m,m"))) - (match_operand:VF_AVX512 2 "register_operand" "0,v") - (match_operand:VF_AVX512 3 "register_operand" "v,0")))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfnmadd132<ssemodesuffix>\t{%1<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %1<avx512bcst>} - vfnmadd231<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %1<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name>_bcst_3" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v")) - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m,m")) - (match_operand:VF_AVX512 3 "register_operand" "v,0")))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfnmadd132<ssemodesuffix>\t{%2<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<avx512bcst>} - vfnmadd231<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - (define_insn "<avx512>_fnmadd_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4417,10 +4238,10 @@ [(set (match_operand:VF_SF_AVX512VL 0 "register_operand" "=v,v,v") (fma:VF_SF_AVX512VL (neg:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 1 "<round_nimm_predicate>" "%0,0,v")) - (match_operand:VF_SF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>") + (match_operand:VF_SF_AVX512VL 1 "<bcst_round_nimm_predicate>" "%0,0,v")) + (match_operand:VF_SF_AVX512VL 2 "<bcst_round_nimm_predicate>" "<bcst_round_constraint>,v,<bcst_round_constraint>") (neg:VF_SF_AVX512VL - (match_operand:VF_SF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>,0"))))] + (match_operand:VF_SF_AVX512VL 3 "<bcst_round_nimm_predicate>" "v,<bcst_round_constraint>,0"))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ vfnmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} @@ -4429,52 +4250,6 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v") - (fma:VF_AVX512 - (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "%0")) - (match_operand:VF_AVX512 2 "register_operand" "v") - (neg:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m")))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "vfnmsub213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" - [(set_attr "type" 
"ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name>_bcst_2" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (neg:VF_AVX512 - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 1 "memory_operand" "m,m"))) - (match_operand:VF_AVX512 2 "register_operand" "0,v") - (neg:VF_AVX512 - (match_operand:VF_AVX512 3 "register_operand" "v,0"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfnmsub132<ssemodesuffix>\t{%1<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %1<avx512bcst>} - vfnmsub231<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %1<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - -(define_insn "*<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name>_bcst_3" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") - (fma:VF_AVX512 - (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v")) - (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 2 "memory_operand" "m,m")) - (neg:VF_AVX512 - (match_operand:VF_AVX512 3 "register_operand" "v,0"))))] - "TARGET_AVX512F && <sd_mask_mode512bit_condition>" - "@ - vfnmsub132<ssemodesuffix>\t{%2<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<avx512bcst>} - vfnmsub231<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<avx512bcst>}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "<MODE>")]) - (define_insn "<avx512>_fnmsub_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -6374,7 +6149,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "sse2_cvtsd2ss<round_name>" +(define_insn "sse2_cvtsd2ss<mask_name><round_name>" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") (vec_merge:V4SF (vec_duplicate:V4SF @@ -6386,7 +6161,7 @@ "@ cvtsd2ss\t{%2, %0|%0, %2} cvtsd2ss\t{%2, %0|%0, %q2} - vcvtsd2ss\t{<round_op3>%2, %1, %0|%0, %1, %q2<round_op3>}" + vcvtsd2ss\t{<round_mask_op3>%2, %1, %0<mask_operand3>|<mask_operand3>%0, %1, %q2<round_mask_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double,*") @@ -6417,7 +6192,7 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "SF")]) -(define_insn "sse2_cvtss2sd<round_saeonly_name>" +(define_insn "sse2_cvtss2sd<mask_name><round_saeonly_name>" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") (vec_merge:V2DF (float_extend:V2DF @@ -6430,7 +6205,7 @@ "@ cvtss2sd\t{%2, %0|%0, %2} cvtss2sd\t{%2, %0|%0, %k2} - vcvtss2sd\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %k2<round_saeonly_op3>}" + vcvtss2sd\t{<round_saeonly_mask_op3>%2, %1, %0<mask_operand3>|<mask_operand3>%0, %1, %k2<round_saeonly_mask_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "amdfam10_decode" "vector,double,*") @@ -11563,8 +11338,8 @@ (define_insn "*<plusminus_insn><mode>3" [(set (match_operand:VI_AVX2 0 "register_operand" "=x,v") (plusminus:VI_AVX2 - (match_operand:VI_AVX2 1 "vector_operand" "<comm>0,v") - (match_operand:VI_AVX2 2 "vector_operand" "xBm,vm")))] + (match_operand:VI_AVX2 1 "bcst_vector_operand" "<comm>0,v") + (match_operand:VI_AVX2 2 "bcst_vector_operand" "xBm,vmBr")))] "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "@ p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2} @@ -11572,31 +11347,7 @@ [(set_attr "isa" "noavx,avx") (set_attr "type" "sseiadd") (set_attr "prefix_data16" "1,*") - (set_attr "prefix" 
"orig,vex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*sub<mode>3_bcst" - [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") - (minus:VI48_AVX512VL - (match_operand:VI48_AVX512VL 1 "register_operand" "v") - (vec_duplicate:VI48_AVX512VL - (match_operand:<ssescalarmode> 2 "memory_operand" "m"))))] - "TARGET_AVX512F && ix86_binary_operator_ok (MINUS, <MODE>mode, operands)" - "vpsub<ssemodesuffix>\t{%2<avx512bcst>, %1, %0|%0, %1, %2<avx512bcst>}" - [(set_attr "type" "sseiadd") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*add<mode>3_bcst" - [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") - (plus:VI48_AVX512VL - (vec_duplicate:VI48_AVX512VL - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VI48_AVX512VL 2 "register_operand" "v")))] - "TARGET_AVX512F && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)" - "vpadd<ssemodesuffix>\t{%1<avx512bcst>, %2, %0|%0, %2, %1<avx512bcst>}" - [(set_attr "type" "sseiadd") - (set_attr "prefix" "evex") + (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "*<plusminus_insn><mode>3_mask" @@ -12110,24 +11861,13 @@ (set_attr "mode" "TI")]) (define_insn "avx512dq_mul<mode>3<mask_name>" - [(set (match_operand:VI8 0 "register_operand" "=v") - (mult:VI8 - (match_operand:VI8 1 "register_operand" "v") - (match_operand:VI8 2 "nonimmediate_operand" "vm")))] - "TARGET_AVX512DQ && <mask_mode512bit_condition>" - "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" - [(set_attr "type" "sseimul") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx512dq_mul<mode>3<mask_name>_bcst" [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v") (mult:VI8_AVX512VL - (vec_duplicate:VI8_AVX512VL - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VI8_AVX512VL 2 "register_operand" "v")))] - "TARGET_AVX512DQ" - "vpmullq\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" + (match_operand:VI8_AVX512VL 1 "bcst_vector_operand" "%v") + (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))] + "TARGET_AVX512DQ && <mask_mode512bit_condition> + && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" + "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "type" "sseimul") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -12157,10 +11897,10 @@ (define_insn "*<sse4_1_avx2>_mul<mode>3<mask_name>" [(set (match_operand:VI4_AVX512F 0 "register_operand" "=Yr,*x,v") (mult:VI4_AVX512F - (match_operand:VI4_AVX512F 1 "vector_operand" "%0,0,v") - (match_operand:VI4_AVX512F 2 "vector_operand" "YrBm,*xBm,vm")))] - "TARGET_SSE4_1 && !(MEM_P (operands[1]) && MEM_P (operands[2])) - && <mask_mode512bit_condition>" + (match_operand:VI4_AVX512F 1 "bcst_vector_operand" "%0,0,v") + (match_operand:VI4_AVX512F 2 "bcst_vector_operand" "YrBm,*xBm,vmBr")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, <MODE>mode, operands) + && <mask_mode512bit_condition>" "@ pmulld\t{%2, %0|%0, %2} pmulld\t{%2, %0|%0, %2} @@ -12168,22 +11908,10 @@ [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseimul") (set_attr "prefix_extra" "1") - (set_attr "prefix" "<mask_prefix4>") + (set_attr "prefix" "<bcst_mask_prefix4>") (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*avx512f_mul<mode>3<mask_name>_bcst" - [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v") - (mult:VI4_AVX512VL - 
(vec_duplicate:VI4_AVX512VL - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VI4_AVX512VL 2 "register_operand" "v")))] - "TARGET_AVX512F" - "vpmulld\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" - [(set_attr "type" "sseimul") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_expand "mul<mode>3" [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand") (mult:VI8_AVX2_AVX512F @@ -13210,7 +12938,7 @@ [(set (match_operand:VI 0 "register_operand" "=x,x,v") (and:VI (not:VI (match_operand:VI 1 "register_operand" "0,x,v")) - (match_operand:VI 2 "vector_operand" "xBm,xm,vm")))] + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr")))] "TARGET_SSE" { char buf[64]; @@ -13309,19 +13037,6 @@ ] (const_string "<sseinsnmode>")))]) -(define_insn "*andnot<mode>3_bcst" - [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") - (and:VI48_AVX512VL - (not:VI48_AVX512VL - (match_operand:VI48_AVX512VL 1 "register_operand" "v")) - (vec_duplicate:VI48_AVX512VL - (match_operand:<ssescalarmode> 2 "memory_operand" "m"))))] - "TARGET_AVX512F" - "vpandn<ssemodesuffix>\t{%2<avx512bcst>, %1, %0|%0, %1, %2<avx512bcst>}" - [(set_attr "type" "sselog") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "*andnot<mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL @@ -13351,10 +13066,10 @@ (define_insn "<mask_codefor><code><mode>3<mask_name>" [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,x,v") (any_logic:VI48_AVX_AVX512F - (match_operand:VI48_AVX_AVX512F 1 "vector_operand" "%0,x,v") - (match_operand:VI48_AVX_AVX512F 2 "vector_operand" "xBm,xm,vm")))] + (match_operand:VI48_AVX_AVX512F 1 "bcst_vector_operand" "%0,x,v") + (match_operand:VI48_AVX_AVX512F 2 "bcst_vector_operand" "xBm,xm,vmBr")))] "TARGET_SSE && <mask_mode512bit_condition> - && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" { char buf[64]; const char *ops; @@ -13540,18 +13255,6 @@ ] (const_string "<sseinsnmode>")))]) -(define_insn "*<code><mode>3_bcst" - [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") - (any_logic:VI48_AVX512VL - (vec_duplicate:VI48_AVX512VL - (match_operand:<ssescalarmode> 1 "memory_operand" "m")) - (match_operand:VI48_AVX512VL 2 "register_operand" "v")))] - "TARGET_AVX512F && <mask_avx512vl_condition>" - "vp<logic><ssemodesuffix>\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" - [(set_attr "type" "sseiadd") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_mode_iterator VI1248_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") (V16QI "TARGET_AVX512VL && TARGET_AVX512BW") @@ -19092,7 +18795,7 @@ (set_attr "type" "sse") (set_attr "mode" "<MODE>")]) -(define_insn "avx512er_vmrcp28<mode><round_saeonly_name>" +(define_insn "avx512er_vmrcp28<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -19101,7 +18804,7 @@ (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] "TARGET_AVX512ER" - "vrcp28<ssescalarmodesuffix>\t{<round_saeonly_op3>%1, %2, %0|%0, %2, %<iptr>1<round_saeonly_op3>}" + "vrcp28<ssescalarmodesuffix>\t{<round_saeonly_mask_op3>%1, %2, %0<mask_operand3>|<mask_opernad3>%0, %2, %<iptr>1<round_saeonly_mask_op3>}" [(set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "type" "sse") @@ 
-19118,7 +18821,7 @@ (set_attr "type" "sse") (set_attr "mode" "<MODE>")]) -(define_insn "avx512er_vmrsqrt28<mode><round_saeonly_name>" +(define_insn "avx512er_vmrsqrt28<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -19127,7 +18830,7 @@ (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] "TARGET_AVX512ER" - "vrsqrt28<ssescalarmodesuffix>\t{<round_saeonly_op3>%1, %2, %0|%0, %2, %<iptr>1<round_saeonly_op3>}" + "vrsqrt28<ssescalarmodesuffix>\t{<round_saeonly_mask_op3>%1, %2, %0<mask_operand3>|<mask_operand3>%0, %2, %<iptr>1<round_saeonly_mask_op3>}" [(set_attr "length_immediate" "1") (set_attr "type" "sse") (set_attr "prefix" "evex") diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 58ea9dc..e037a96 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -60,7 +60,9 @@ (define_subst_attr "mask_prefix" "mask" "vex" "evex") (define_subst_attr "mask_prefix2" "mask" "maybe_vex" "evex") (define_subst_attr "mask_prefix3" "mask" "orig,vex" "evex,evex") +(define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex") (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex") +(define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex") (define_subst_attr "mask_expand_op3" "mask" "3" "5") (define_subst "mask" @@ -130,9 +132,11 @@ (define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>") (define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") (define_subst_attr "round_constraint" "round" "vm" "v") +(define_subst_attr "bcst_round_constraint" "round" "vmBr" "v") (define_subst_attr "round_constraint2" "round" "m" "v") (define_subst_attr "round_constraint3" "round" "rm" "r") (define_subst_attr "round_nimm_predicate" "round" "vector_operand" "register_operand") +(define_subst_attr "bcst_round_nimm_predicate" "round" "bcst_vector_operand" "register_operand") (define_subst_attr "round_nimm_scalar_predicate" "round" "nonimmediate_operand" "register_operand") (define_subst_attr "round_prefix" "round" "vex" "evex") (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 5134e08..e5fb061 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -17,7 +17,8 @@ # <http://www.gnu.org/licenses/>. OPTIONS_H_EXTRA += $(srcdir)/config/i386/stringop.def -TM_H += $(srcdir)/config/i386/x86-tune.def +TM_H += $(srcdir)/config/i386/x86-tune.def \ + $(srcdir)/common/config/i386/i386-cpuinfo.h PASSES_EXTRA += $(srcdir)/config/i386/i386-passes.def i386-c.o: $(srcdir)/config/i386/i386-c.c diff --git a/gcc/config/i386/t-rtems b/gcc/config/i386/t-rtems index 7626970..5f078c6 100644 --- a/gcc/config/i386/t-rtems +++ b/gcc/config/i386/t-rtems @@ -17,10 +17,10 @@ # <http://www.gnu.org/licenses/>. 
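The sse.md changes above retire the separate *_bcst patterns and instead let the main add/sub/mul/div, FMA and vector-logic patterns accept an embedded-broadcast memory operand through the new bcst_vector_operand predicate and the "Br" constraint alternative. A hedged illustration of the kind of source this targets (the function name is an assumption; compiled with -O2 -mavx512f, the scalar load is a candidate for being encoded as a {1to16} broadcast operand of vpaddd rather than going through a separate vpbroadcastd):

#include <immintrin.h>

/* Add the same memory-resident scalar to every element of a 512-bit
   vector; the set1 broadcast can be folded into the arithmetic
   instruction as an AVX-512 embedded broadcast.  */
__m512i
add_scalar_to_all (__m512i x, const int *p)
{
  return _mm512_add_epi32 (x, _mm512_set1_epi32 (*p));
}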
# -MULTILIB_OPTIONS = mtune=i486/mtune=pentium/mtune=pentiumpro msoft-float +MULTILIB_OPTIONS = march=i486/march=pentium/march=pentiumpro msoft-float MULTILIB_DIRNAMES= m486 mpentium mpentiumpro soft-float MULTILIB_MATCHES = msoft-float=mno-80387 -MULTILIB_MATCHES += mtune?pentium=mtune?k6 mtune?pentiumpro=mtune?athlon +MULTILIB_MATCHES += march?pentium=march?k6 march?pentiumpro=march?athlon MULTILIB_EXCEPTIONS = \ -mtune=pentium/*msoft-float* \ -mtune=pentiumpro/*msoft-float* +march=pentium/*msoft-float* \ +march=pentiumpro/*msoft-float* diff --git a/gcc/config/i386/tbmintrin.h b/gcc/config/i386/tbmintrin.h index c8a9d77..e03bf91 100644 --- a/gcc/config/i386/tbmintrin.h +++ b/gcc/config/i386/tbmintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <tbmintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _TBMINTRIN_H_INCLUDED diff --git a/gcc/config/i386/tsxldtrkintrin.h b/gcc/config/i386/tsxldtrkintrin.h index 08b76a9..eab36d0 100644 --- a/gcc/config/i386/tsxldtrkintrin.h +++ b/gcc/config/i386/tsxldtrkintrin.h @@ -1,5 +1,28 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <tsxldtrkintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _TSXLDTRKINTRIN_H_INCLUDED diff --git a/gcc/config/i386/uintrintrin.h b/gcc/config/i386/uintrintrin.h new file mode 100644 index 0000000..991f642 --- /dev/null +++ b/gcc/config/i386/uintrintrin.h @@ -0,0 +1,87 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead." +#endif + +#ifndef _UINTRNTRIN_H_INCLUDED +#define _UINTRNTRIN_H_INCLUDED + +#ifdef __x86_64__ + +#ifndef __UINTR__ +#pragma GCC push_options +#pragma GCC target ("uintr") +#define __DISABLE_UINTR__ +#endif /* __UINTR__ */ + +struct __uintr_frame +{ + /* The position of the most significant bit set in user-interrupt + request register. */ + unsigned long long uirrv; + /* RIP of the interrupted user process. */ + unsigned long long rip; + /* RFLAGS of the interrupted user process. */ + unsigned long long rflags; + /* RSP of the interrupted user process. */ + unsigned long long rsp; +}; + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_clui (void) +{ + __builtin_ia32_clui (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_stui (void) +{ + __builtin_ia32_stui (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_senduipi (unsigned long long __R) +{ + __builtin_ia32_senduipi (__R); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_testui (void) +{ + return __builtin_ia32_testui (); +} + +#ifdef __DISABLE_UINTR__ +#undef __DISABLE_UINTR__ +#pragma GCC pop_options +#endif /* __DISABLE_UINTR__ */ + +#endif + +#endif /* _UINTRNTRIN_H_INCLUDED. */ diff --git a/gcc/config/i386/waitpkgintrin.h b/gcc/config/i386/waitpkgintrin.h index 5dbcde3..5046c98 100644 --- a/gcc/config/i386/waitpkgintrin.h +++ b/gcc/config/i386/waitpkgintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <waitpkgintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <waitpkgintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _WAITPKG_H_INCLUDED diff --git a/gcc/config/i386/wbnoinvdintrin.h b/gcc/config/i386/wbnoinvdintrin.h index 5393698..7089e61 100644 --- a/gcc/config/i386/wbnoinvdintrin.h +++ b/gcc/config/i386/wbnoinvdintrin.h @@ -1,5 +1,28 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <wbnoinvdintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <wbnoinvdintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _WBNOINVDINTRIN_H_INCLUDED diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h new file mode 100644 index 0000000..ffe07e4 --- /dev/null +++ b/gcc/config/i386/x86gprintrin.h @@ -0,0 +1,256 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +#define _X86GPRINTRIN_H_INCLUDED + +#include <ia32intrin.h> + +#ifndef __iamcu__ + +#include <stddef.h> + +#include <adxintrin.h> + +#include <bmiintrin.h> + +#include <bmi2intrin.h> + +#include <cetintrin.h> + +#include <cldemoteintrin.h> + +#include <clflushoptintrin.h> + +#include <clwbintrin.h> + +#include <clzerointrin.h> + +#include <enqcmdintrin.h> + +#include <fxsrintrin.h> + +#include <lzcntintrin.h> + +#include <lwpintrin.h> + +#include <movdirintrin.h> + +#include <mwaitxintrin.h> + +#include <pconfigintrin.h> + +#include <popcntintrin.h> + +#include <pkuintrin.h> + +#include <rdseedintrin.h> + +#include <rtmintrin.h> + +#include <serializeintrin.h> + +#include <sgxintrin.h> + +#include <tbmintrin.h> + +#include <tsxldtrkintrin.h> + +#include <uintrintrin.h> + +#include <waitpkgintrin.h> + +#include <wbnoinvdintrin.h> + +#include <xsaveintrin.h> + +#include <xsavecintrin.h> + +#include <xsaveoptintrin.h> + +#include <xsavesintrin.h> + +#include <xtestintrin.h> + +#include <hresetintrin.h> + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wbinvd (void) +{ + __builtin_ia32_wbinvd (); +} + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand16_step (unsigned short *__P) +{ + return __builtin_ia32_rdrand16_step (__P); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand32_step (unsigned int *__P) +{ + return __builtin_ia32_rdrand32_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#ifndef __RDPID__ +#pragma GCC push_options +#pragma GCC target("rdpid") +#define __DISABLE_RDPID__ +#endif /* __RDPID__ */ +extern __inline unsigned int 
+__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdpid_u32 (void) +{ + return __builtin_ia32_rdpid (); +} +#ifdef __DISABLE_RDPID__ +#undef __DISABLE_RDPID__ +#pragma GCC pop_options +#endif /* __DISABLE_RDPID__ */ + +#ifdef __x86_64__ + +#ifndef __FSGSBASE__ +#pragma GCC push_options +#pragma GCC target("fsgsbase") +#define __DISABLE_FSGSBASE__ +#endif /* __FSGSBASE__ */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u32 (void) +{ + return __builtin_ia32_rdfsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u64 (void) +{ + return __builtin_ia32_rdfsbase64 (); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u32 (void) +{ + return __builtin_ia32_rdgsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u64 (void) +{ + return __builtin_ia32_rdgsbase64 (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrfsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrfsbase64 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrgsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrgsbase64 (__B); +} +#ifdef __DISABLE_FSGSBASE__ +#undef __DISABLE_FSGSBASE__ +#pragma GCC pop_options +#endif /* __DISABLE_FSGSBASE__ */ + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand64_step (unsigned long long *__P) +{ + return __builtin_ia32_rdrand64_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#endif /* __x86_64__ */ + +#ifndef __PTWRITE__ +#pragma GCC push_options +#pragma GCC target("ptwrite") +#define __DISABLE_PTWRITE__ +#endif + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite64 (unsigned long long __B) +{ + __builtin_ia32_ptwrite64 (__B); +} +#endif /* __x86_64__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite32 (unsigned __B) +{ + __builtin_ia32_ptwrite32 (__B); +} +#ifdef __DISABLE_PTWRITE__ +#undef __DISABLE_PTWRITE__ +#pragma GCC pop_options +#endif /* __DISABLE_PTWRITE__ */ + +#endif /* __iamcu__ */ + +#endif /* _X86GPRINTRIN_H_INCLUDED. 
*/ diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h index 59fdceb..bc6cb40 100644 --- a/gcc/config/i386/x86intrin.h +++ b/gcc/config/i386/x86intrin.h @@ -24,7 +24,7 @@ #ifndef _X86INTRIN_H_INCLUDED #define _X86INTRIN_H_INCLUDED -#include <ia32intrin.h> +#include <x86gprintrin.h> #ifndef __iamcu__ @@ -37,16 +37,6 @@ #include <xopintrin.h> -#include <lwpintrin.h> - -#include <tbmintrin.h> - -#include <popcntintrin.h> - -#include <mwaitxintrin.h> - -#include <clzerointrin.h> - #endif /* __iamcu__ */ #endif /* _X86INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/xsavecintrin.h b/gcc/config/i386/xsavecintrin.h index 039e215..06c9f36 100644 --- a/gcc/config/i386/xsavecintrin.h +++ b/gcc/config/i386/xsavecintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavecintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVECINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsaveintrin.h b/gcc/config/i386/xsaveintrin.h index 9f0b8bb..f9cac0d 100644 --- a/gcc/config/i386/xsaveintrin.h +++ b/gcc/config/i386/xsaveintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVEINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsaveoptintrin.h b/gcc/config/i386/xsaveoptintrin.h index 9da3297..4f2756b 100644 --- a/gcc/config/i386/xsaveoptintrin.h +++ b/gcc/config/i386/xsaveoptintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVEOPTINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsavesintrin.h b/gcc/config/i386/xsavesintrin.h index 264f1c4..629a1f3 100644 --- a/gcc/config/i386/xsavesintrin.h +++ b/gcc/config/i386/xsavesintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVESINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xtestintrin.h b/gcc/config/i386/xtestintrin.h index cb187e4..757cc34 100644 --- a/gcc/config/i386/xtestintrin.h +++ b/gcc/config/i386/xtestintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _IMMINTRIN_H_INCLUDED -# error "Never use <xtestintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _XTESTINTRIN_H_INCLUDED diff --git a/gcc/config/linux-protos.h b/gcc/config/linux-protos.h index 3759187..c52778b 100644 --- a/gcc/config/linux-protos.h +++ b/gcc/config/linux-protos.h @@ -19,4 +19,4 @@ along with GCC; see the file COPYING3. If not see extern bool linux_has_ifunc_p (void); -extern bool linux_libc_has_function (enum function_class fn_class); +extern bool linux_libc_has_function (enum function_class fn_class, tree); diff --git a/gcc/config/linux.c b/gcc/config/linux.c index 9876153..83ffff4 100644 --- a/gcc/config/linux.c +++ b/gcc/config/linux.c @@ -25,7 +25,8 @@ along with GCC; see the file COPYING3. If not see #include "linux-protos.h" bool -linux_libc_has_function (enum function_class fn_class) +linux_libc_has_function (enum function_class fn_class, + tree type ATTRIBUTE_UNUSED) { if (OPTION_GLIBC || OPTION_MUSL) return true; diff --git a/gcc/config/msp430/msp430.md b/gcc/config/msp430/msp430.md index f70e61b..ad244bb 100644 --- a/gcc/config/msp430/msp430.md +++ b/gcc/config/msp430/msp430.md @@ -1346,12 +1346,12 @@ ;; instructions, so we provide a pattern to support it here. (define_insn "andneghi3" [(set (match_operand:HI 0 "register_operand" "=r") - (and:HI (neg:HI (match_operand:HI 1 "register_operand" "r")) + (and:HI (neg:HI (match_operand:HI 1 "general_operand" "rm")) (match_operand 2 "immediate_operand" "n")))] "" "* if (REGNO (operands[0]) != REGNO (operands[1])) - return \"MOV.W\t%1, %0 { INV.W\t%0 { INC.W\t%0 { AND.W\t%2, %0\"; + return \"MOV%X1.W\t%1, %0 { INV.W\t%0 { INC.W\t%0 { AND.W\t%2, %0\"; else return \"INV.W\t%0 { INC.W\t%0 { AND.W\t%2, %0\"; " diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index 4fecb2b..a3c4099 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -399,7 +399,8 @@ compile_native (const char *infile, const char *outfile, const char *compiler, obstack_ptr_grow (&argv_obstack, NULL); const char **new_argv = XOBFINISH (&argv_obstack, const char **); - fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true); + fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true, + ".gccnative_args"); obstack_free (&argv_obstack, NULL); } @@ -582,7 +583,8 @@ main (int argc, char **argv) unsetenv ("COMPILER_PATH"); unsetenv ("LIBRARY_PATH"); - fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true); + fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true, + ".gcc_args"); obstack_free (&argv_obstack, NULL); xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL)); @@ -594,6 +596,7 @@ main (int argc, char **argv) fatal_error (input_location, "cannot open intermediate ptx file"); process (in, out); + fclose (in); } fclose (out); diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 0c590d8..1734947 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -368,6 +368,22 @@ nvptx_name_replacement (const char *name) return name; } +/* Return NULL if NAME contains no dot. Otherwise return a copy of NAME + with the dots replaced with dollar signs. */ + +static char * +nvptx_replace_dot (const char *name) +{ + if (strchr (name, '.') == NULL) + return NULL; + + char *p = xstrdup (name); + for (size_t i = 0; i < strlen (p); ++i) + if (p[i] == '.') + p[i] = '$'; + return p; +} + /* If MODE should be treated as two registers of an inner mode, return that inner mode. Otherwise return VOIDmode. 
*/ @@ -827,26 +843,12 @@ write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name) fputs ("\n", file); } -/* Write a .func or .kernel declaration or definition along with - a helper comment for use by ld. S is the stream to write to, DECL - the decl for the function with name NAME. For definitions, emit - a declaration too. */ +/* Helper function for write_fn_proto. */ -static const char * -write_fn_proto (std::stringstream &s, bool is_defn, - const char *name, const_tree decl) +static void +write_fn_proto_1 (std::stringstream &s, bool is_defn, + const char *name, const_tree decl) { - if (is_defn) - /* Emit a declaration. The PTX assembler gets upset without it. */ - name = write_fn_proto (s, false, name, decl); - else - { - /* Avoid repeating the name replacement. */ - name = nvptx_name_replacement (name); - if (name[0] == '*') - name++; - } - write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); /* PTX declaration. */ @@ -929,8 +931,38 @@ write_fn_proto (std::stringstream &s, bool is_defn, s << ")"; s << (is_defn ? "\n" : ";\n"); +} - return name; +/* Write a .func or .kernel declaration or definition along with + a helper comment for use by ld. S is the stream to write to, DECL + the decl for the function with name NAME. For definitions, emit + a declaration too. */ + +static void +write_fn_proto (std::stringstream &s, bool is_defn, + const char *name, const_tree decl) +{ + const char *replacement = nvptx_name_replacement (name); + char *replaced_dots = NULL; + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } + if (name[0] == '*') + name++; + + if (is_defn) + /* Emit a declaration. The PTX assembler gets upset without it. */ + write_fn_proto_1 (s, false, name, decl); + + write_fn_proto_1 (s, is_defn, name, decl); + + if (replaced_dots) + XDELETE (replaced_dots); } /* Construct a function declaration from a call insn. This can be @@ -942,6 +974,8 @@ static void write_fn_proto_from_insn (std::stringstream &s, const char *name, rtx result, rtx pat) { + char *replaced_dots = NULL; + if (!name) { s << "\t.callprototype "; @@ -949,7 +983,15 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, } else { - name = nvptx_name_replacement (name); + const char *replacement = nvptx_name_replacement (name); + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } write_fn_marker (s, false, true, name); s << "\t.extern .func "; } @@ -958,6 +1000,8 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, write_return_mode (s, true, GET_MODE (result)); s << name; + if (replaced_dots) + XDELETE (replaced_dots); int arg_end = XVECLEN (pat, 0); for (int i = 1; i < arg_end; i++) @@ -2101,7 +2145,7 @@ nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p)) val = INTVAL (XEXP (x, 1)); x = XEXP (x, 0); gcc_assert (GET_CODE (x) == SYMBOL_REF); - /* FALLTHROUGH */ + gcc_fallthrough (); /* FALLTHROUGH */ case SYMBOL_REF: gcc_assert (size == init_frag.size); @@ -2349,6 +2393,7 @@ const char * nvptx_output_mov_insn (rtx dst, rtx src) { machine_mode dst_mode = GET_MODE (dst); + machine_mode src_mode = GET_MODE (src); machine_mode dst_inner = (GET_CODE (dst) == SUBREG ? 
GET_MODE (XEXP (dst, 0)) : dst_mode); machine_mode src_inner = (GET_CODE (src) == SUBREG @@ -2375,7 +2420,7 @@ nvptx_output_mov_insn (rtx dst, rtx src) if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner)) { if (GET_MODE_BITSIZE (dst_mode) == 128 - && GET_MODE_BITSIZE (GET_MODE (src)) == 128) + && GET_MODE_BITSIZE (src_mode) == 128) { /* mov.b128 is not supported. */ if (dst_inner == V2DImode && src_inner == TImode) @@ -2388,6 +2433,10 @@ nvptx_output_mov_insn (rtx dst, rtx src) return "%.\tmov.b%T0\t%0, %1;"; } + if (GET_MODE_BITSIZE (src_inner) == 128 + && GET_MODE_BITSIZE (src_mode) == 64) + return "%.\tmov.b%T0\t%0, %1;"; + return "%.\tcvt%t0%t1\t%0, %1;"; } @@ -2458,9 +2507,20 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) if (decl) { + char *replaced_dots = NULL; const char *name = get_fnname_from_decl (decl); - name = nvptx_name_replacement (name); + const char *replacement = nvptx_name_replacement (name); + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } assemble_name (asm_out_file, name); + if (replaced_dots) + XDELETE (replaced_dots); } else output_address (VOIDmode, callee); @@ -2598,7 +2658,7 @@ nvptx_print_operand (FILE *file, rtx x, int code) { case 'A': x = XEXP (x, 0); - /* FALLTHROUGH. */ + gcc_fallthrough (); /* FALLTHROUGH. */ case 'D': if (GET_CODE (x) == CONST) @@ -6531,6 +6591,23 @@ nvptx_set_current_function (tree fndecl) oacc_bcast_partition = 0; } +/* Implement TARGET_LIBC_HAS_FUNCTION. */ + +bool +nvptx_libc_has_function (enum function_class fn_class, tree type) +{ + if (fn_class == function_sincos) + { + if (type != NULL_TREE) + /* Currently, newlib does not support sincosl. */ + return type == float_type_node || type == double_type_node; + else + return true; + } + + return default_libc_has_function (fn_class, type); +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override @@ -6676,6 +6753,9 @@ nvptx_set_current_function (tree fndecl) #undef TARGET_SET_CURRENT_FUNCTION #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function +#undef TARGET_LIBC_HAS_FUNCTION +#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-nvptx.h" diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 6ebcc76..17fe157 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -29,7 +29,10 @@ #define STARTFILE_SPEC "%{mmainkernel:crt0.o}" -#define ASM_SPEC "%{misa=*:-m %*}" +/* Default needs to be in sync with default for misa in nvptx.opt. + We add a default here to work around a hard-coded sm_30 default in + nvptx-as. */ +#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}" #define TARGET_CPU_CPP_BUILTINS() \ do \ diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 6178e6a..ccbcd09 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -146,6 +146,13 @@ return true; }) +;; Test for a function symbol ref operand +(define_predicate "symbol_ref_function_operand" + (match_code "symbol_ref") +{ + return SYMBOL_REF_FUNCTION_P (op); +}) + (define_attr "predicable" "false,true" (const_string "true")) @@ -241,6 +248,17 @@ } [(set_attr "subregs_ok" "true")]) +;; ptxas segfaults on 'mov.u64 %r24,bar+4096', so break it up. 
+(define_split + [(set (match_operand:DI 0 "nvptx_register_operand") + (const:DI (plus:DI (match_operand:DI 1 "symbol_ref_function_operand") + (match_operand 2 "const_int_operand"))))] + "" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 2))) + ] + "") + (define_insn "*mov<mode>_insn" [(set (match_operand:SDFM 0 "nonimmediate_operand" "=R,R,m") (match_operand:SDFM 1 "general_operand" "RF,m,R"))] @@ -365,9 +383,13 @@ [(set (match_operand:QHIM 0 "nvptx_nonimmediate_operand" "=R,m") (truncate:QHIM (match_operand:SI 1 "nvptx_register_operand" "R,R")))] "" - "@ - %.\\tcvt%t0.u32\\t%0, %1; - %.\\tst%A0.u%T0\\t%0, %1;" + { + if (which_alternative == 1) + return "%.\\tst%A0.u%T0\\t%0, %1;"; + if (GET_MODE (operands[0]) == QImode) + return "%.\\tmov%t0\\t%0, %1;"; + return "%.\\tcvt%t0.u32\\t%0, %1;"; + } [(set_attr "subregs_ok" "true")]) (define_insn "truncdi<mode>2" diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 75c3d54..045e354 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -17,9 +17,11 @@ ; along with GCC; see the file COPYING3. If not see ; <http://www.gnu.org/licenses/>. -m32 -Target Report RejectNegative InverseMask(ABI64) -Generate code for a 32-bit ABI. +; It's not clear whether this was ever build/tested/used, so this is no longer +; exposed to the user. +;m32 +;Target Report RejectNegative InverseMask(ABI64) +;Generate code for a 32-bit ABI. m64 Target Report RejectNegative Mask(ABI64) @@ -37,7 +39,7 @@ msoft-stack Target Report Mask(SOFT_STACK) Use custom stacks instead of local memory for automatic storage. -msoft-stack-reserve-local +msoft-stack-reserve-local= Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128) Specify size of .local memory used for stack when the exact amount is not known. @@ -59,6 +61,7 @@ Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) EnumValue Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) +; Default needs to be in sync with default in ASM_SPEC in nvptx.h. misa= -Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30) +Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM35) Specify the version of the ptx ISA to use. diff --git a/gcc/config/pa/pa-hpux11.h b/gcc/config/pa/pa-hpux11.h index 794bf8e..2820720 100644 --- a/gcc/config/pa/pa-hpux11.h +++ b/gcc/config/pa/pa-hpux11.h @@ -154,11 +154,6 @@ along with GCC; see the file COPYING3. If not see %{!mt:%{!pthread:-a shared -lc -a archive}}}}\ %{shared:%{mt|pthread:-lpthread}}" -/* The libgcc_stub.a library needs to come last. */ -#undef LINK_GCC_C_SEQUENCE_SPEC -#define LINK_GCC_C_SEQUENCE_SPEC \ - "%G %{!nolibc:%L} %G %{!nostdlib:%{!nodefaultlibs:%{!shared:-lgcc_stub}}}" - #undef STARTFILE_SPEC #define STARTFILE_SPEC \ "%{!shared:%{pg:gcrt0%O%s}%{!pg:%{p:mcrt0%O%s}%{!p:crt0%O%s}} \ diff --git a/gcc/config/pa/pa32-linux.h b/gcc/config/pa/pa32-linux.h index f271bbf..970722a 100644 --- a/gcc/config/pa/pa32-linux.h +++ b/gcc/config/pa/pa32-linux.h @@ -57,6 +57,11 @@ call_ ## FUNC (void) \ } #endif +/* We need to link against libgcc.a for __canonicalize_funcptr_for_compare + and $$dyncall. 
*/ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC GNU_USER_TARGET_ENDFILE_SPEC "libgcc.a%s" + #undef WCHAR_TYPE #define WCHAR_TYPE "long int" diff --git a/gcc/config/pa/pa64-hpux.h b/gcc/config/pa/pa64-hpux.h index c7d127f7..096aa4b 100644 --- a/gcc/config/pa/pa64-hpux.h +++ b/gcc/config/pa/pa64-hpux.h @@ -103,12 +103,6 @@ along with GCC; see the file COPYING3. If not see %{shared:%{mt|pthread:-lpthread}}" #endif -/* The libgcc_stub.a and milli.a libraries need to come last. */ -#undef LINK_GCC_C_SEQUENCE_SPEC -#define LINK_GCC_C_SEQUENCE_SPEC "\ - %G %{!nolibc:%L} %G %{!nostdlib:%{!nodefaultlibs:%{!shared:-lgcc_stub}\ - milli.a%s}}" - /* Under hpux11, the normal location of the `ld' and `as' programs is the /usr/ccs/bin directory. */ @@ -335,8 +329,12 @@ do { \ %{static:crtbeginT%O%s} %{!static:%{!shared:crtbegin%O%s} \ %{shared:crtbeginS%O%s}}" #endif + +/* The libgcc_stub.a and milli.a libraries must come last. We need + to link with these libraries whenever start files are needed. */ #undef ENDFILE_SPEC -#define ENDFILE_SPEC "%{!shared:crtend%O%s} %{shared:crtendS%O%s}" +#define ENDFILE_SPEC \ + "%{!shared:crtend%O%s libgcc_stub.a%s} %{shared:crtendS%O%s} milli.a%s" /* Since HP uses the .init and .fini sections for array initializers and finalizers, we need different defines for INIT_SECTION_ASM_OP diff --git a/gcc/config/riscv/multilib-generator b/gcc/config/riscv/multilib-generator index 8f4df18..57ee7c3 100755 --- a/gcc/config/riscv/multilib-generator +++ b/gcc/config/riscv/multilib-generator @@ -22,14 +22,26 @@ # Each argument to this script is of the form # <primary arch>-<abi>-<additional arches>-<extensions> -# For example, +# Example 1: # rv32imafd-ilp32d-rv32g-c,v # means that, in addition to rv32imafd, these configurations can also use the # rv32imafd-ilp32d libraries: rv32imafdc, rv32imafdv, rv32g, rv32gc, rv32gv +# +# Example 2: +# rv32imafd-ilp32d--c*b +# means that, in addition to rv32imafd, these configurations can also use the +# rv32imafd-ilp32d libraries: rv32imafdc-ilp32d, rv32imafdb-ilp32d, +# rv32imafdcb-ilp32d from __future__ import print_function import sys import collections +import itertools +from functools import reduce + +# +# TODO: Add test for this script. +# arches = collections.OrderedDict() abis = collections.OrderedDict() @@ -37,37 +49,53 @@ required = [] reuse = [] canonical_order = "mafdgqlcbjtpvn" +LONG_EXT_PREFIXES = ['z', 's', 'h', 'x'] + +# +# IMPLIED_EXT(ext) -> implied extension list. +# +IMPLIED_EXT = { + "d" : ["f"], +} def arch_canonicalize(arch): - # TODO: Support implied extensions, e.g. D implied F in latest spec. # TODO: Support extension version. new_arch = "" if arch[:5] in ['rv32e', 'rv32i', 'rv32g', 'rv64i', 'rv64g']: - # TODO: We should expand g to imadzifencei once we support newer spec. + # TODO: We should expand g to imad_zifencei once we support newer spec. new_arch = arch[:5].replace("g", "imafd") else: raise Exception("Unexpected arch: `%s`" % arch[:5]) # Find any Z, S, H or X - long_ext_prefixes = ['z', 's', 'h', 'x'] - long_ext_prefixes_idx = map(lambda x: arch.find(x), long_ext_prefixes) + long_ext_prefixes_idx = map(lambda x: arch.find(x), LONG_EXT_PREFIXES) # Filter out any non-existent index. 
long_ext_prefixes_idx = list(filter(lambda x: x != -1, long_ext_prefixes_idx)) if long_ext_prefixes_idx: first_long_ext_idx = min(long_ext_prefixes_idx) long_exts = arch[first_long_ext_idx:].split("_") - std_exts = arch[5:first_long_ext_idx] + std_exts = list(arch[5:first_long_ext_idx]) else: long_exts = [] - std_exts = arch[5:] + std_exts = list(arch[5:]) + + # + # Handle implied extensions. + # + for ext in std_exts + long_exts: + if ext in IMPLIED_EXT: + implied_exts = IMPLIED_EXT[ext] + for implied_ext in implied_exts: + if implied_ext not in std_exts + long_exts: + long_exts.append(implied_ext) # Single letter extension might appear in the long_exts list, # becasue we just append extensions list to the arch string. - std_exts += "".join(filter(lambda x:len(x) == 1, long_exts)) + std_exts += list(filter(lambda x:len(x) == 1, long_exts)) # Multi-letter extension must be in lexicographic order. - long_exts = sorted(filter(lambda x:len(x) != 1, long_exts)) + long_exts = list(sorted(filter(lambda x:len(x) != 1, long_exts))) # Put extensions in canonical order. for ext in canonical_order: @@ -86,15 +114,98 @@ def arch_canonicalize(arch): new_arch += "_" + "_".join(long_exts) return new_arch +# +# add underline for each multi-char extensions. +# e.g. ["a", "zfh"] -> ["a", "_zfh"] +# +def add_underline_prefix(ext): + for long_ext_prefix in LONG_EXT_PREFIXES: + if ext.startswith(long_ext_prefix): + return "_" + ext + + return ext + +# +# Handle expansion operation. +# +# e.g. "a*b" -> [("a",), ("b",), ("a", "b")] +# "a" -> [("a",)] +# +def _expand_combination(ext): + exts = list(ext.split("*")) + + # No need to expand if there is no `*`. + if len(exts) == 1: + return [(exts[0],)] + + # Add underline to every extension. + # e.g. + # _b * zvamo => _b * _zvamo + exts = list(map(lambda x: '_' + x, exts)) + + # Generate combination! + ext_combs = [] + for comb_len in range(1, len(exts)+1): + for ext_comb in itertools.combinations(exts, comb_len): + ext_combs.append(ext_comb) + + return ext_combs + +# +# Input a list and drop duplicated entry. +# e.g. +# ["a", "b", "ab", "a"] -> ["a", "b", "ab"] +# +def unique(x): + # + # Drop duplicated entry. + # Convert list to set and then convert back to list. + # + # Add sorted to prevent non-deterministic results in different env. + # + return list(sorted(list(set(x)))) + +# +# Expand EXT string if there is any expansion operator (*). +# e.g. +# "a*b,c" -> ["a", "b", "ab", "c"] +# +def expand_combination(ext): + ext = list(filter(None, ext.split(','))) + + # Expand combination for EXT, got lots of list. + # e.g. + # a * b => [[("a",), ("b",)], [("a", "b")]] + ext_combs = list(map(_expand_combination, ext)) + + # Then fold to single list. + # e.g. + # [[("a",), ("b",)], [("a", "b")]] => [("a",), ("b",), ("a", "b")] + ext = list(reduce(lambda x, y: x + y, ext_combs, [])) + + # Fold the tuple to string. + # e.g. + # [("a",), ("b",), ("a", "b")] => ["a", "b", "ab"] + ext = map(lambda e : reduce(lambda x, y: x + y, e), ext) + + # Drop duplicated entry. + ext = unique(ext) + + return ext + for cfg in sys.argv[1:]: (arch, abi, extra, ext) = cfg.split('-') arch = arch_canonicalize (arch) arches[arch] = 1 abis[abi] = 1 extra = list(filter(None, extra.split(','))) - ext = list(filter(None, ext.split(','))) - alts = sum([[x] + [x + "_" + y for y in ext] for x in [arch] + extra], []) + ext_combs = expand_combination(ext) + alts = sum([[x] + [x + y for y in ext_combs] for x in [arch] + extra], []) alts = list(map(arch_canonicalize, alts)) + + # Drop duplicated entry. 
+ alts = unique(alts) + for alt in alts[1:]: arches[alt] = 1 reuse.append('march.%s/mabi.%s=march.%s/mabi.%s' % (arch, abi, alt, abi)) diff --git a/gcc/config/riscv/riscv-c.c b/gcc/config/riscv/riscv-c.c index 735f2f2..c600badb 100644 --- a/gcc/config/riscv/riscv-c.c +++ b/gcc/config/riscv/riscv-c.c @@ -90,12 +90,15 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile) builtin_define ("__riscv_cmodel_medlow"); break; + case CM_PIC: + /* __riscv_cmodel_pic is deprecated and will be removed in the next GCC release; see https://github.com/riscv/riscv-c-api-doc/pull/11 */ + builtin_define ("__riscv_cmodel_pic"); + /* FALLTHROUGH. */ + case CM_MEDANY: builtin_define ("__riscv_cmodel_medany"); break; - case CM_PIC: - builtin_define ("__riscv_cmodel_pic"); - break; } } diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def new file mode 100644 index 0000000..6a13f3e --- /dev/null +++ b/gcc/config/riscv/riscv-cores.def @@ -0,0 +1,49 @@ +/* List of supported core and tune info for RISC-V. + Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* This is a list of cores that implement RISC-V. + + Before using #include to read this file, define a macro: + + RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH, TUNE_INFO) + + The CORE_NAME is the name of the core, represented as a string. + The ARCH is the default arch of the core, represented as a string; + it can be NULL if there is no default arch. + The MICRO_ARCH is the name of the core for which scheduling decisions + will be made, represented as an identifier. + The TUNE_INFO is the detailed cost model for this core, represented as an + identifier; see riscv-tunes.def. */ + +RISCV_CORE("sifive-e20", "rv32imc", "rocket") +RISCV_CORE("sifive-e21", "rv32imac", "rocket") +RISCV_CORE("sifive-e24", "rv32imafc", "rocket") +RISCV_CORE("sifive-e31", "rv32imac", "sifive-3-series") +RISCV_CORE("sifive-e34", "rv32imafc", "sifive-3-series") +RISCV_CORE("sifive-e76", "rv32imafc", "sifive-7-series") + +RISCV_CORE("sifive-s21", "rv64imac", "rocket") +RISCV_CORE("sifive-s51", "rv64imac", "sifive-5-series") +RISCV_CORE("sifive-s54", "rv64imafdc", "sifive-5-series") +RISCV_CORE("sifive-s76", "rv64imafdc", "sifive-7-series") + +RISCV_CORE("sifive-u54", "rv64imafdc", "sifive-5-series") +RISCV_CORE("sifive-u74", "rv64imafdc", "sifive-7-series") + +#undef RISCV_CORE diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 358224a..256dab1 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -94,4 +94,18 @@ extern bool riscv_hard_regno_rename_ok (unsigned, unsigned); rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt); +/* Information about one CPU we know about. */ +struct riscv_cpu_info { + /* This CPU's canonical name. */ + const char *name; + + /* Default arch for this CPU; may be NULL if there is no default arch.
*/ + const char *arch; + + /* Which automaton to use for tuning. */ + const char *tune; +}; + +extern const riscv_cpu_info *riscv_find_cpu (const char *); + #endif /* ! GCC_RISCV_PROTOS_H */ diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 63b0c38..989a9f1 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -209,7 +209,7 @@ struct riscv_integer_op { /* Costs of various operations on the different architectures. */ -struct riscv_tune_info +struct riscv_tune_param { unsigned short fp_add[2]; unsigned short fp_mul[2]; @@ -222,16 +222,16 @@ struct riscv_tune_info bool slow_unaligned_access; }; -/* Information about one CPU we know about. */ -struct riscv_cpu_info { - /* This CPU's canonical name. */ +/* Information about one micro-arch we know about. */ +struct riscv_tune_info { + /* This micro-arch canonical name. */ const char *name; /* Which automaton to use for tuning. */ enum riscv_microarchitecture_type microarchitecture; - /* Tuning parameters for this CPU. */ - const struct riscv_tune_info *tune_info; + /* Tuning parameters for this micro-arch. */ + const struct riscv_tune_param *tune_param; }; /* Global variables for machine-dependent things. */ @@ -248,7 +248,7 @@ unsigned riscv_stack_boundary; static int epilogue_cfa_sp_offset; /* Which tuning parameters to use. */ -static const struct riscv_tune_info *tune_info; +static const struct riscv_tune_param *tune_param; /* Which automaton to use for tuning. */ enum riscv_microarchitecture_type riscv_microarchitecture; @@ -275,7 +275,7 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = { }; /* Costs to use when optimizing for rocket. */ -static const struct riscv_tune_info rocket_tune_info = { +static const struct riscv_tune_param rocket_tune_info = { {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_mul */ {COSTS_N_INSNS (20), COSTS_N_INSNS (20)}, /* fp_div */ @@ -288,7 +288,7 @@ static const struct riscv_tune_info rocket_tune_info = { }; /* Costs to use when optimizing for Sifive 7 Series. */ -static const struct riscv_tune_info sifive_7_tune_info = { +static const struct riscv_tune_param sifive_7_tune_info = { {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_mul */ {COSTS_N_INSNS (20), COSTS_N_INSNS (20)}, /* fp_div */ @@ -301,7 +301,7 @@ static const struct riscv_tune_info sifive_7_tune_info = { }; /* Costs to use when optimizing for size. */ -static const struct riscv_tune_info optimize_size_tune_info = { +static const struct riscv_tune_param optimize_size_tune_info = { {COSTS_N_INSNS (1), COSTS_N_INSNS (1)}, /* fp_add */ {COSTS_N_INSNS (1), COSTS_N_INSNS (1)}, /* fp_mul */ {COSTS_N_INSNS (1), COSTS_N_INSNS (1)}, /* fp_div */ @@ -343,7 +343,7 @@ static const unsigned gpr_save_reg_order[] = { }; /* A table describing all the processors GCC knows about. */ -static const struct riscv_cpu_info riscv_cpu_info_table[] = { +static const struct riscv_tune_info riscv_tune_info_table[] = { { "rocket", generic, &rocket_tune_info }, { "sifive-3-series", generic, &rocket_tune_info }, { "sifive-5-series", generic, &rocket_tune_info }, @@ -351,17 +351,22 @@ static const struct riscv_cpu_info riscv_cpu_info_table[] = { { "size", generic, &optimize_size_tune_info }, }; -/* Return the riscv_cpu_info entry for the given name string. */ +/* Return the riscv_tune_info entry for the given name string. 
*/ -static const struct riscv_cpu_info * -riscv_parse_cpu (const char *cpu_string) +static const struct riscv_tune_info * +riscv_parse_tune (const char *tune_string) { - for (unsigned i = 0; i < ARRAY_SIZE (riscv_cpu_info_table); i++) - if (strcmp (riscv_cpu_info_table[i].name, cpu_string) == 0) - return riscv_cpu_info_table + i; + const riscv_cpu_info *cpu = riscv_find_cpu (tune_string); - error ("unknown cpu %qs for %<-mtune%>", cpu_string); - return riscv_cpu_info_table; + if (cpu) + tune_string = cpu->tune; + + for (unsigned i = 0; i < ARRAY_SIZE (riscv_tune_info_table); i++) + if (strcmp (riscv_tune_info_table[i].name, tune_string) == 0) + return riscv_tune_info_table + i; + + error ("unknown cpu %qs for %<-mtune%>", tune_string); + return riscv_tune_info_table; } /* Helper function for riscv_build_integer; arguments are as for @@ -1703,7 +1708,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN instructions it needs. */ if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0) { - *total = COSTS_N_INSNS (cost + tune_info->memory_cost); + *total = COSTS_N_INSNS (cost + tune_param->memory_cost); return true; } /* Otherwise use the default handling. */ @@ -1770,7 +1775,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN mode instead. */ mode = GET_MODE (XEXP (x, 0)); if (float_mode_p) - *total = tune_info->fp_add[mode == DFmode]; + *total = tune_param->fp_add[mode == DFmode]; else *total = riscv_binary_cost (x, 1, 3); return false; @@ -1779,19 +1784,19 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case ORDERED: /* (FEQ(A, A) & FEQ(B, B)) compared against 0. */ mode = GET_MODE (XEXP (x, 0)); - *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2); + *total = tune_param->fp_add[mode == DFmode] + COSTS_N_INSNS (2); return false; case UNEQ: /* (FEQ(A, A) & FEQ(B, B)) compared against FEQ(A, B). */ mode = GET_MODE (XEXP (x, 0)); - *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (3); + *total = tune_param->fp_add[mode == DFmode] + COSTS_N_INSNS (3); return false; case LTGT: /* (FLT(A, A) || FGT(B, B)). */ mode = GET_MODE (XEXP (x, 0)); - *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2); + *total = tune_param->fp_add[mode == DFmode] + COSTS_N_INSNS (2); return false; case UNGE: @@ -1800,13 +1805,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case UNLT: /* FLT or FLE, but guarded by an FFLAGS read and write. 
*/ mode = GET_MODE (XEXP (x, 0)); - *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (4); + *total = tune_param->fp_add[mode == DFmode] + COSTS_N_INSNS (4); return false; case MINUS: case PLUS: if (float_mode_p) - *total = tune_info->fp_add[mode == DFmode]; + *total = tune_param->fp_add[mode == DFmode]; else *total = riscv_binary_cost (x, 1, 4); return false; @@ -1816,7 +1821,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN rtx op = XEXP (x, 0); if (GET_CODE (op) == FMA && !HONOR_SIGNED_ZEROS (mode)) { - *total = (tune_info->fp_mul[mode == DFmode] + *total = (tune_param->fp_mul[mode == DFmode] + set_src_cost (XEXP (op, 0), mode, speed) + set_src_cost (XEXP (op, 1), mode, speed) + set_src_cost (XEXP (op, 2), mode, speed)); @@ -1825,23 +1830,23 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN } if (float_mode_p) - *total = tune_info->fp_add[mode == DFmode]; + *total = tune_param->fp_add[mode == DFmode]; else *total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 4 : 1); return false; case MULT: if (float_mode_p) - *total = tune_info->fp_mul[mode == DFmode]; + *total = tune_param->fp_mul[mode == DFmode]; else if (!TARGET_MUL) /* Estimate the cost of a library call. */ *total = COSTS_N_INSNS (speed ? 32 : 6); else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - *total = 3 * tune_info->int_mul[0] + COSTS_N_INSNS (2); + *total = 3 * tune_param->int_mul[0] + COSTS_N_INSNS (2); else if (!speed) *total = COSTS_N_INSNS (1); else - *total = tune_info->int_mul[mode == DImode]; + *total = tune_param->int_mul[mode == DImode]; return false; case DIV: @@ -1849,7 +1854,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case MOD: if (float_mode_p) { - *total = tune_info->fp_div[mode == DFmode]; + *total = tune_param->fp_div[mode == DFmode]; return false; } /* Fall through. */ @@ -1860,7 +1865,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN /* Estimate the cost of a library call. */ *total = COSTS_N_INSNS (speed ? 32 : 6); else if (speed) - *total = tune_info->int_div[mode == DImode]; + *total = tune_param->int_div[mode == DImode]; else *total = COSTS_N_INSNS (1); return false; @@ -1882,11 +1887,11 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case FIX: case FLOAT_EXTEND: case FLOAT_TRUNCATE: - *total = tune_info->fp_add[mode == DFmode]; + *total = tune_param->fp_add[mode == DFmode]; return false; case FMA: - *total = (tune_info->fp_mul[mode == DFmode] + *total = (tune_param->fp_mul[mode == DFmode] + set_src_cost (XEXP (x, 0), mode, speed) + set_src_cost (XEXP (x, 1), mode, speed) + set_src_cost (XEXP (x, 2), mode, speed)); @@ -4546,7 +4551,7 @@ riscv_class_max_nregs (reg_class_t rclass, machine_mode mode) static int riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in) { - return (tune_info->memory_cost + return (tune_param->memory_cost + memory_move_secondary_cost (mode, rclass, in)); } @@ -4555,7 +4560,7 @@ riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in) static int riscv_issue_rate (void) { - return tune_info->issue_rate; + return tune_param->issue_rate; } /* Auxiliary function to emit RISC-V ELF attribute. 
*/ @@ -4683,7 +4688,7 @@ riscv_init_machine_status (void) static void riscv_option_override (void) { - const struct riscv_cpu_info *cpu; + const struct riscv_tune_info *cpu; #ifdef SUBTARGET_OVERRIDE_OPTIONS SUBTARGET_OVERRIDE_OPTIONS; @@ -4705,26 +4710,28 @@ riscv_option_override (void) if (TARGET_HARD_FLOAT && (target_flags_explicit & MASK_FDIV) == 0) target_flags |= MASK_FDIV; - /* Handle -mtune. */ - cpu = riscv_parse_cpu (riscv_tune_string ? riscv_tune_string : - RISCV_TUNE_STRING_DEFAULT); + /* Handle -mtune, use -mcpu if -mtune is not given, and use the default -mtune + if neither -mtune nor -mcpu is given. */ + cpu = riscv_parse_tune (riscv_tune_string ? riscv_tune_string : + (riscv_cpu_string ? riscv_cpu_string : + RISCV_TUNE_STRING_DEFAULT)); riscv_microarchitecture = cpu->microarchitecture; - tune_info = optimize_size ? &optimize_size_tune_info : cpu->tune_info; + tune_param = optimize_size ? &optimize_size_tune_info : cpu->tune_param; /* Use -mtune's setting for slow_unaligned_access, even when optimizing for size. For architectures that trap and emulate unaligned accesses, the performance cost is too great, even for -Os. Similarly, if -m[no-]strict-align is left unspecified, heed -mtune's advice. */ - riscv_slow_unaligned_access_p = (cpu->tune_info->slow_unaligned_access + riscv_slow_unaligned_access_p = (cpu->tune_param->slow_unaligned_access || TARGET_STRICT_ALIGN); if ((target_flags_explicit & MASK_STRICT_ALIGN) == 0 - && cpu->tune_info->slow_unaligned_access) + && cpu->tune_param->slow_unaligned_access) target_flags |= MASK_STRICT_ALIGN; /* If the user hasn't specified a branch cost, use the processor's default. */ if (riscv_branch_cost == 0) - riscv_branch_cost = tune_info->branch_cost; + riscv_branch_cost = tune_param->branch_cost; /* Function to allocate machine-dependent function status. */ init_machine_status = &riscv_init_machine_status; diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index b7b4a1c..172c7ca 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -41,17 +41,27 @@ along with GCC; see the file COPYING3. If not see #endif extern const char *riscv_expand_arch (int argc, const char **argv); +extern const char *riscv_expand_arch_from_cpu (int argc, const char **argv); +extern const char *riscv_default_mtune (int argc, const char **argv); # define EXTRA_SPEC_FUNCTIONS \ - { "riscv_expand_arch", riscv_expand_arch }, + { "riscv_expand_arch", riscv_expand_arch }, \ + { "riscv_expand_arch_from_cpu", riscv_expand_arch_from_cpu }, \ + { "riscv_default_mtune", riscv_default_mtune }, /* Support for a compile-time default CPU, et cetera. The rules are: - --with-arch is ignored if -march is specified. + --with-arch is ignored if -march or -mcpu is specified. --with-abi is ignored if -mabi is specified. - --with-tune is ignored if -mtune is specified. */ + --with-tune is ignored if -mtune or -mcpu is specified. + + But the default -march/-mtune value is used if -mcpu does not have a valid option.
*/ #define OPTION_DEFAULT_SPECS \ - {"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \ - {"arch", "%{!march=*:-march=%(VALUE)}" }, \ + {"tune", "%{!mtune=*:" \ + " %{!mcpu=*:-mtune=%(VALUE)}" \ + " %{mcpu=*:-mtune=%:riscv_default_mtune(%* %(VALUE))}}" }, \ + {"arch", "%{!march=*:" \ + " %{!mcpu=*:-march=%(VALUE)}" \ + " %{mcpu=*:%:riscv_expand_arch_from_cpu(%* %(VALUE))}}" }, \ {"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \ #ifdef IN_LIBGCC2 @@ -69,8 +79,9 @@ extern const char *riscv_expand_arch (int argc, const char **argv); %(subtarget_asm_spec)" #undef DRIVER_SELF_SPECS -#define DRIVER_SELF_SPECS \ -"%{march=*:-march=%:riscv_expand_arch(%*)}" +#define DRIVER_SELF_SPECS \ +"%{march=*:%:riscv_expand_arch(%*)} " \ +"%{!march=*:%{mcpu=*:%:riscv_expand_arch_from_cpu(%*)}} " #define TARGET_DEFAULT_CMODEL CM_MEDLOW diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index f01d3ab..808b4a0 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -79,6 +79,10 @@ mtune= Target RejectNegative Joined Var(riscv_tune_string) -mtune=PROCESSOR Optimize the output for PROCESSOR. +mcpu= +Target RejectNegative Joined Var(riscv_cpu_string) +-mcpu=PROCESSOR Use architecture of and optimize the output for PROCESSOR. + msmall-data-limit= Target Joined Separate UInteger Var(g_switch_value) Init(8) -msmall-data-limit=N Put global and static data smaller than <number> bytes into a special section (on some targets). diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 4820fb3..702767c 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -24,3 +24,5 @@ riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.c $(POSTCOMPILE) PASSES_EXTRA += $(srcdir)/config/riscv/riscv-passes.def + +$(common_out_file): $(srcdir)/config/riscv/riscv-cores.def diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index 8a2dcda..df10a8c 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -236,6 +236,9 @@ #define vec_lvebx __builtin_vec_lvebx #define vec_lvehx __builtin_vec_lvehx #define vec_lvewx __builtin_vec_lvewx +#define vec_xl_zext __builtin_vec_ze_lxvrx +#define vec_xl_sext __builtin_vec_se_lxvrx +#define vec_xst_trunc __builtin_vec_tr_stxvrx #define vec_neg __builtin_vec_neg #define vec_pmsum_be __builtin_vec_vpmsum #define vec_shasigma_be __builtin_crypto_vshasigma diff --git a/gcc/config/rs6000/freebsd64.h b/gcc/config/rs6000/freebsd64.h index c991363..6984ca5 100644 --- a/gcc/config/rs6000/freebsd64.h +++ b/gcc/config/rs6000/freebsd64.h @@ -78,65 +78,7 @@ extern int dot_symbols; #undef SUBSUBTARGET_OVERRIDE_OPTIONS #define SUBSUBTARGET_OVERRIDE_OPTIONS \ - do \ - { \ - if (!global_options_set.x_rs6000_alignment_flags) \ - rs6000_alignment_flags = MASK_ALIGN_NATURAL; \ - if (TARGET_64BIT) \ - { \ - if (DEFAULT_ABI != ABI_AIX) \ - { \ - rs6000_current_abi = ABI_AIX; \ - error (INVALID_64BIT, "call"); \ - } \ - dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); \ - if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; \ - error (INVALID_64BIT, "relocatable"); \ - } \ - if (ELFv2_ABI_CHECK) \ - { \ - rs6000_current_abi = ABI_ELFv2; \ - if (dot_symbols) \ - error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_EABI) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_EABI; \ - error (INVALID_64BIT, "eabi"); \ - } \ - if (TARGET_PROTOTYPE) \ - { \ - target_prototype = 0; \ - error (INVALID_64BIT, "prototype"); \ - } \ - 
if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) \ - { \ - rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ - error ("%<-m64%> requires a PowerPC64 cpu"); \ - } \ - if ((rs6000_isa_flags_explicit \ - & OPTION_MASK_MINIMAL_TOC) != 0) \ - { \ - if (global_options_set.x_rs6000_current_cmodel \ - && rs6000_current_cmodel != CMODEL_SMALL) \ - error ("%<-mcmodel%> incompatible with other toc options"); \ - SET_CMODEL (CMODEL_SMALL); \ - } \ - else \ - { \ - if (!global_options_set.x_rs6000_current_cmodel) \ - SET_CMODEL (CMODEL_MEDIUM); \ - if (rs6000_current_cmodel != CMODEL_SMALL) \ - { \ - TARGET_NO_FP_IN_TOC = 0; \ - TARGET_NO_SUM_IN_TOC = 0; \ - } \ - } \ - } \ - } \ - while (0) + do rs6000_linux64_override_options (); while (0) #undef ASM_SPEC #undef LINK_OS_FREEBSD_SPEC diff --git a/gcc/config/rs6000/linux64.h b/gcc/config/rs6000/linux64.h index 2ded330..73b6c01 100644 --- a/gcc/config/rs6000/linux64.h +++ b/gcc/config/rs6000/linux64.h @@ -96,90 +96,7 @@ extern int dot_symbols; #undef SUBSUBTARGET_OVERRIDE_OPTIONS #define SUBSUBTARGET_OVERRIDE_OPTIONS \ - do \ - { \ - if (!global_options_set.x_rs6000_alignment_flags) \ - rs6000_alignment_flags = MASK_ALIGN_NATURAL; \ - if (rs6000_isa_flags & OPTION_MASK_64BIT) \ - { \ - if (DEFAULT_ABI != ABI_AIX) \ - { \ - rs6000_current_abi = ABI_AIX; \ - error (INVALID_64BIT, "call"); \ - } \ - dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); \ - if (ELFv2_ABI_CHECK) \ - { \ - rs6000_current_abi = ABI_ELFv2; \ - if (dot_symbols) \ - error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; \ - error (INVALID_64BIT, "relocatable"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_EABI) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_EABI; \ - error (INVALID_64BIT, "eabi"); \ - } \ - if (TARGET_PROTOTYPE) \ - { \ - target_prototype = 0; \ - error (INVALID_64BIT, "prototype"); \ - } \ - if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) \ - { \ - rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ - error ("%<-m64%> requires a PowerPC64 cpu"); \ - } \ - if ((rs6000_isa_flags_explicit \ - & OPTION_MASK_MINIMAL_TOC) != 0) \ - { \ - if (global_options_set.x_rs6000_current_cmodel \ - && rs6000_current_cmodel != CMODEL_SMALL) \ - error ("%<-mcmodel incompatible with other toc options%>"); \ - SET_CMODEL (CMODEL_SMALL); \ - } \ - else \ - { \ - if (!global_options_set.x_rs6000_current_cmodel) \ - SET_CMODEL (CMODEL_MEDIUM); \ - if (rs6000_current_cmodel != CMODEL_SMALL) \ - { \ - if (!global_options_set.x_TARGET_NO_FP_IN_TOC) \ - TARGET_NO_FP_IN_TOC \ - = rs6000_current_cmodel == CMODEL_MEDIUM; \ - if (!global_options_set.x_TARGET_NO_SUM_IN_TOC) \ - TARGET_NO_SUM_IN_TOC = 0; \ - } \ - } \ - if (TARGET_PLTSEQ && DEFAULT_ABI != ABI_ELFv2) \ - { \ - if (global_options_set.x_rs6000_pltseq) \ - warning (0, "%qs unsupported for this ABI", \ - "-mpltseq"); \ - rs6000_pltseq = false; \ - } \ - } \ - else \ - { \ - if (!RS6000_BI_ARCH_P) \ - error (INVALID_32BIT, "32"); \ - if (TARGET_PROFILE_KERNEL) \ - { \ - TARGET_PROFILE_KERNEL = 0; \ - error (INVALID_32BIT, "profile-kernel"); \ - } \ - if (global_options_set.x_rs6000_current_cmodel) \ - { \ - SET_CMODEL (CMODEL_SMALL); \ - error (INVALID_32BIT, "cmodel"); \ - } \ - } \ - } \ - while (0) + do rs6000_linux64_override_options (); while (0) #undef ASM_SPEC #undef LINK_OS_LINUX_SPEC diff --git a/gcc/config/rs6000/ppc-asm.h b/gcc/config/rs6000/ppc-asm.h index 48edc99..e0bce9c 100644 --- 
a/gcc/config/rs6000/ppc-asm.h +++ b/gcc/config/rs6000/ppc-asm.h @@ -262,6 +262,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #undef toc #define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#ifdef __PCREL__ +#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ + .localentry FUNC_NAME(name),1 +#else #define JUMP_TARGET(name) FUNC_NAME(name) #define FUNC_START(name) \ .type FUNC_NAME(name),@function; \ @@ -270,6 +278,7 @@ FUNC_NAME(name): \ 0: addis 2,12,(.TOC.-0b)@ha; \ addi 2,2,(.TOC.-0b)@l; \ .localentry FUNC_NAME(name),.-FUNC_NAME(name) +#endif /* !__PCREL__ */ #define HIDDEN_FUNC(name) \ FUNC_START(name) \ diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index e91a48d..5b05da8 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -1111,7 +1111,7 @@ | RS6000_BTC_UNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10_MISC_2(ENUM, NAME, ATTR, ICODE) \ +#define BU_P10_POWERPC64_MISC_2(ENUM, NAME, ATTR, ICODE) \ RS6000_BUILTIN_2 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ "__builtin_" NAME, /* NAME */ \ RS6000_BTM_P10 \ @@ -1145,6 +1145,14 @@ CODE_FOR_ ## ICODE) /* ICODE */ #endif +#define BU_P10V_OVERLOAD_X(ENUM, NAME) \ + RS6000_BUILTIN_X (P10_BUILTIN_VEC_ ## ENUM, /* ENUM */ \ + "__builtin_vec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_OVERLOADED /* ATTR */ \ + | RS6000_BTC_SPECIAL), \ + CODE_FOR_nothing) /* ICODE */ + /* Power 10 Altivec builtins */ #define BU_P10V_AV_0(ENUM, NAME, ATTR, ICODE) \ @@ -1179,6 +1187,15 @@ | RS6000_BTC_TERNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ +#define BU_P10V_AV_X(ENUM, NAME, ATTR) \ + RS6000_BUILTIN_X (P10_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_SPECIAL), \ + CODE_FOR_nothing) /* ICODE */ + + /* Insure 0 is not a legitimate index. */ BU_SPECIAL_X (RS6000_BUILTIN_NONE, NULL, 0, RS6000_BTC_MISC) @@ -1474,6 +1491,18 @@ BU_ALTIVEC_X (LVSR, "lvsr", PURE) BU_ALTIVEC_X (LVEBX, "lvebx", PURE) BU_ALTIVEC_X (LVEHX, "lvehx", PURE) BU_ALTIVEC_X (LVEWX, "lvewx", PURE) +BU_P10V_AV_X (SE_LXVRBX, "se_lxvrbx", PURE) +BU_P10V_AV_X (SE_LXVRHX, "se_lxvrhx", PURE) +BU_P10V_AV_X (SE_LXVRWX, "se_lxvrwx", PURE) +BU_P10V_AV_X (SE_LXVRDX, "se_lxvrdx", PURE) +BU_P10V_AV_X (ZE_LXVRBX, "ze_lxvrbx", PURE) +BU_P10V_AV_X (ZE_LXVRHX, "ze_lxvrhx", PURE) +BU_P10V_AV_X (ZE_LXVRWX, "ze_lxvrwx", PURE) +BU_P10V_AV_X (ZE_LXVRDX, "ze_lxvrdx", PURE) +BU_P10V_AV_X (TR_STXVRBX, "tr_stxvrbx", MEM) +BU_P10V_AV_X (TR_STXVRHX, "tr_stxvrhx", MEM) +BU_P10V_AV_X (TR_STXVRWX, "tr_stxvrwx", MEM) +BU_P10V_AV_X (TR_STXVRDX, "tr_stxvrdx", MEM) BU_ALTIVEC_X (LVXL, "lvxl", PURE) BU_ALTIVEC_X (LVXL_V2DF, "lvxl_v2df", PURE) BU_ALTIVEC_X (LVXL_V2DI, "lvxl_v2di", PURE) @@ -1740,6 +1769,9 @@ BU_ALTIVEC_OVERLOAD_X (LDL, "ldl") BU_ALTIVEC_OVERLOAD_X (LVEBX, "lvebx") BU_ALTIVEC_OVERLOAD_X (LVEHX, "lvehx") BU_ALTIVEC_OVERLOAD_X (LVEWX, "lvewx") +BU_P10V_OVERLOAD_X (SE_LXVRX, "se_lxvrx") +BU_P10V_OVERLOAD_X (ZE_LXVRX, "ze_lxvrx") +BU_P10V_OVERLOAD_X (TR_STXVRX, "tr_stxvrx") BU_ALTIVEC_OVERLOAD_X (LVLX, "lvlx") BU_ALTIVEC_OVERLOAD_X (LVLXL, "lvlxl") BU_ALTIVEC_OVERLOAD_X (LVRX, "lvrx") @@ -2727,11 +2759,11 @@ BU_P9_OVERLOAD_2 (CMPRB2, "byte_in_either_range") BU_P9_OVERLOAD_2 (CMPEQB, "byte_in_set") /* Builtins for scalar instructions added in ISA 3.1 (power10). 
*/ -BU_P10_MISC_2 (CFUGED, "cfuged", CONST, cfuged) -BU_P10_MISC_2 (CNTLZDM, "cntlzdm", CONST, cntlzdm) -BU_P10_MISC_2 (CNTTZDM, "cnttzdm", CONST, cnttzdm) -BU_P10_MISC_2 (PDEPD, "pdepd", CONST, pdepd) -BU_P10_MISC_2 (PEXTD, "pextd", CONST, pextd) +BU_P10_POWERPC64_MISC_2 (CFUGED, "cfuged", CONST, cfuged) +BU_P10_POWERPC64_MISC_2 (CNTLZDM, "cntlzdm", CONST, cntlzdm) +BU_P10_POWERPC64_MISC_2 (CNTTZDM, "cnttzdm", CONST, cnttzdm) +BU_P10_POWERPC64_MISC_2 (PDEPD, "pdepd", CONST, pdepd) +BU_P10_POWERPC64_MISC_2 (PEXTD, "pextd", CONST, pextd) /* Builtins for vector instructions added in ISA 3.1 (power10). */ BU_P10V_AV_2 (VCLRLB, "vclrlb", CONST, vclrlb) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index f5982907..cc1e997 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -597,6 +597,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, /* Tell the user if we support the MMA instructions. */ if ((flags & OPTION_MASK_MMA) != 0) rs6000_define_or_undefine_macro (define_p, "__MMA__"); + /* Whether pc-relative code is being generated. */ + if ((flags & OPTION_MASK_PCREL) != 0) + rs6000_define_or_undefine_macro (define_p, "__PCREL__"); } void diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index a8b52083..b044778 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -57,16 +57,14 @@ #include "gimplify.h" #include "gimple-fold.h" #include "gimple-iterator.h" -#include "gimple-ssa.h" +#include "ssa.h" +#include "tree-ssa-propagate.h" #include "builtins.h" #include "tree-vector-builder.h" #if TARGET_XCOFF #include "xcoffout.h" /* get declarations of xcoff_*_section_name */ #endif #include "ppc-auxv.h" -#include "tree-ssa-propagate.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" #include "targhooks.h" #include "opts.h" @@ -1154,6 +1152,65 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_LVEBX, ALTIVEC_BUILTIN_LVEBX, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI, 0 }, + /* vector signed__int128 vec_xl_sext (signed long long, signed char *); + vector signed__int128 vec_xl_sext (signed long long, signed short *); + vector signed__int128 vec_xl_sext (signed long long, signed int *); + vector signed__int128 vec_xl_sext (signed long long, signed longlong *); */ + { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRBX, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI, 0 }, + { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRHX, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI, 0 }, + { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRWX, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI, 0 }, + { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRDX, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTDI, 0 }, + { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRDX, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_long_long, 0 }, + + /* vector unsigned__int128 vec_xl_zext (signed long long, unsigned char *); + vector unsigned__int128 vec_xl_zext (signed long long, unsigned short *); + vector unsigned__int128 vec_xl_zext (signed long long, unsigned int *); + vector unsigned__int128 vec_xl_zext (signed long long, unsigned longlong *); */ + { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRBX, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI, 0 }, + { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRHX, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTHI, 0 }, + { P10_BUILTIN_VEC_ZE_LXVRX, 
P10_BUILTIN_ZE_LXVRWX, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTSI, 0 }, + { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRDX, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTDI, 0 }, + { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRDX, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_long_long, 0 }, + + /* void vec_xst_trunc (vector signed __int128, signed long long, signed char *); + void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *); + void vec_xst_trunc (vector signed __int128, signed long long, signed char *); + void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *); + void vec_xst_trunc (vector signed __int128, signed long long, signed char *); + void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *); + void vec_xst_trunc (vector signed __int128, signed long long, signed char *); + void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *); */ + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRBX, RS6000_BTI_void, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRBX, RS6000_BTI_void, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRHX, RS6000_BTI_void, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRHX, RS6000_BTI_void, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTHI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRWX, RS6000_BTI_void, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRWX, RS6000_BTI_void, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_long_long }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_long_long }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void, + RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTDI }, + { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTDI }, + /* vector float vec_ldl (int, vector float *); vector float vec_ldl (int, float *); */ { ALTIVEC_BUILTIN_VEC_LDL, ALTIVEC_BUILTIN_LVXL_V4SF, @@ -9576,6 +9633,85 @@ swap_endian_selector_for_mode (machine_mode mode) gen_rtvec_v (16, perm))); } +/* For the load and sign extend rightmost elements; load and zero extend + rightmost element builtins. */ +static rtx +altivec_expand_lxvr_builtin (enum insn_code icode, tree exp, rtx target, bool blk, bool sign_extend) +{ + rtx pat, addr; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + machine_mode tmode = insn_data[icode].operand[0].mode; + machine_mode smode = insn_data[icode].operand[1].mode; + machine_mode mode0 = Pmode; + machine_mode mode1 = Pmode; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + + if (icode == CODE_FOR_nothing) + /* Builtin not supported on this processor. */ + return 0; + + /* If we got invalid arguments bail out before generating bad rtl. */ + if (arg0 == error_mark_node || arg1 == error_mark_node) + return const0_rtx; + + if (target == 0 + || GET_MODE (target) != tmode + || ! 
(*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + + op1 = copy_to_mode_reg (mode1, op1); + + if (op0 == const0_rtx) + addr = gen_rtx_MEM (blk ? BLKmode : tmode, op1); + else + { + op0 = copy_to_mode_reg (mode0, op0); + addr = gen_rtx_MEM (blk ? BLKmode : smode, + gen_rtx_PLUS (Pmode, op1, op0)); + } + + if (sign_extend) + { + rtx discratch = gen_reg_rtx (DImode); + rtx tiscratch = gen_reg_rtx (TImode); + + /* Emit the lxvr*x insn. */ + pat = GEN_FCN (icode) (tiscratch, addr); + if (!pat) + return 0; + emit_insn (pat); + + /* Emit a sign extension from QI,HI,WI to double (DI). */ + rtx scratch = gen_lowpart (smode, tiscratch); + if (icode == CODE_FOR_vsx_lxvrbx) + emit_insn (gen_extendqidi2 (discratch, scratch)); + else if (icode == CODE_FOR_vsx_lxvrhx) + emit_insn (gen_extendhidi2 (discratch, scratch)); + else if (icode == CODE_FOR_vsx_lxvrwx) + emit_insn (gen_extendsidi2 (discratch, scratch)); + /* Assign discratch directly if scratch is already DI. */ + if (icode == CODE_FOR_vsx_lxvrdx) + discratch = scratch; + + /* Emit the sign extension from DI (double) to TI (quad). */ + emit_insn (gen_extendditi2 (target, discratch)); + + return target; + } + else + { + /* Zero extend. */ + pat = GEN_FCN (icode) (target, addr); + if (!pat) + return 0; + emit_insn (pat); + return target; + } + return 0; +} + static rtx altivec_expand_lv_builtin (enum insn_code icode, tree exp, rtx target, bool blk) { @@ -9694,7 +9830,7 @@ altivec_expand_stv_builtin (enum insn_code icode, tree exp) rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2 = expand_normal (arg2); - rtx pat, addr, rawaddr; + rtx pat, addr, rawaddr, truncrtx; machine_mode tmode = insn_data[icode].operand[0].mode; machine_mode smode = insn_data[icode].operand[1].mode; machine_mode mode1 = Pmode; @@ -9733,6 +9869,25 @@ altivec_expand_stv_builtin (enum insn_code icode, tree exp) emit_insn (gen_rtx_SET (addr, op0)); } + else if (icode == CODE_FOR_vsx_stxvrbx + || icode == CODE_FOR_vsx_stxvrhx + || icode == CODE_FOR_vsx_stxvrwx + || icode == CODE_FOR_vsx_stxvrdx) + { + truncrtx = gen_rtx_TRUNCATE (tmode, op0); + op0 = copy_to_mode_reg (E_TImode, truncrtx); + + if (op1 == const0_rtx) + addr = gen_rtx_MEM (Pmode, op2); + else + { + op1 = copy_to_mode_reg (mode1, op1); + addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op2, op1)); + } + pat = GEN_FCN (icode) (addr, op0); + if (pat) + emit_insn (pat); + } else { if (! 
(*insn_data[icode].operand[1].predicate) (op0, smode)) @@ -10752,6 +10907,16 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) return altivec_expand_stv_builtin (CODE_FOR_altivec_stvehx, exp); case ALTIVEC_BUILTIN_STVEWX: return altivec_expand_stv_builtin (CODE_FOR_altivec_stvewx, exp); + + case P10_BUILTIN_TR_STXVRBX: + return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrbx, exp); + case P10_BUILTIN_TR_STXVRHX: + return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrhx, exp); + case P10_BUILTIN_TR_STXVRWX: + return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrwx, exp); + case P10_BUILTIN_TR_STXVRDX: + return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrdx, exp); + case ALTIVEC_BUILTIN_STVXL_V2DF: return altivec_expand_stv_builtin (CODE_FOR_altivec_stvxl_v2df, exp); case ALTIVEC_BUILTIN_STVXL_V2DI: @@ -11014,6 +11179,30 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) case ALTIVEC_BUILTIN_LVEWX: return altivec_expand_lv_builtin (CODE_FOR_altivec_lvewx, exp, target, false); + case P10_BUILTIN_SE_LXVRBX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrbx, + exp, target, false, true); + case P10_BUILTIN_SE_LXVRHX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrhx, + exp, target, false, true); + case P10_BUILTIN_SE_LXVRWX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrwx, + exp, target, false, true); + case P10_BUILTIN_SE_LXVRDX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrdx, + exp, target, false, true); + case P10_BUILTIN_ZE_LXVRBX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrbx, + exp, target, false, false); + case P10_BUILTIN_ZE_LXVRHX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrhx, + exp, target, false, false); + case P10_BUILTIN_ZE_LXVRWX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrwx, + exp, target, false, false); + case P10_BUILTIN_ZE_LXVRDX: + return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrdx, + exp, target, false, false); case ALTIVEC_BUILTIN_LVXL_V2DF: return altivec_expand_lv_builtin (CODE_FOR_altivec_lvxl_v2df, exp, target, false); @@ -12916,15 +13105,13 @@ rs6000_init_builtins (void) /* Vector pair and vector quad support. 
*/ if (TARGET_EXTRA_BUILTINS) { - tree oi_uns_type = make_unsigned_type (256); - vector_pair_type_node = build_distinct_type_copy (oi_uns_type); + vector_pair_type_node = make_unsigned_type (256); SET_TYPE_MODE (vector_pair_type_node, POImode); layout_type (vector_pair_type_node); lang_hooks.types.register_builtin_type (vector_pair_type_node, "__vector_pair"); - tree xi_uns_type = make_unsigned_type (512); - vector_quad_type_node = build_distinct_type_copy (xi_uns_type); + vector_quad_type_node = make_unsigned_type (512); SET_TYPE_MODE (vector_quad_type_node, PXImode); layout_type (vector_quad_type_node); lang_hooks.types.register_builtin_type (vector_quad_type_node, @@ -13298,6 +13485,18 @@ altivec_init_builtins (void) def_builtin ("__builtin_altivec_lvebx", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEBX); def_builtin ("__builtin_altivec_lvehx", v8hi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEHX); def_builtin ("__builtin_altivec_lvewx", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEWX); + def_builtin ("__builtin_altivec_se_lxvrbx", v16qi_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRBX); + def_builtin ("__builtin_altivec_se_lxvrhx", v8hi_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRHX); + def_builtin ("__builtin_altivec_se_lxvrwx", v4si_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRWX); + def_builtin ("__builtin_altivec_se_lxvrdx", v2di_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRDX); + def_builtin ("__builtin_altivec_ze_lxvrbx", v16qi_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRBX); + def_builtin ("__builtin_altivec_ze_lxvrhx", v8hi_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRHX); + def_builtin ("__builtin_altivec_ze_lxvrwx", v4si_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRWX); + def_builtin ("__builtin_altivec_ze_lxvrdx", v2di_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRDX); + def_builtin ("__builtin_altivec_tr_stxvrbx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRBX); + def_builtin ("__builtin_altivec_tr_stxvrhx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRHX); + def_builtin ("__builtin_altivec_tr_stxvrwx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRWX); + def_builtin ("__builtin_altivec_tr_stxvrdx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRDX); def_builtin ("__builtin_altivec_lvxl", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVXL); def_builtin ("__builtin_altivec_lvxl_v2df", v2df_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVXL_V2DF); @@ -13363,6 +13562,9 @@ altivec_init_builtins (void) def_builtin ("__builtin_vec_lvebx", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEBX); def_builtin ("__builtin_vec_lvehx", v8hi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEHX); def_builtin ("__builtin_vec_lvewx", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEWX); + def_builtin ("__builtin_vec_se_lxvrx", v1ti_ftype_long_pcvoid, P10_BUILTIN_VEC_SE_LXVRX); + def_builtin ("__builtin_vec_ze_lxvrx", v1ti_ftype_long_pcvoid, P10_BUILTIN_VEC_ZE_LXVRX); + def_builtin ("__builtin_vec_tr_stxvrx", void_ftype_opaque_long_pvoid, P10_BUILTIN_VEC_TR_STXVRX); def_builtin ("__builtin_vec_st", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_ST); def_builtin ("__builtin_vec_ste", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STE); def_builtin ("__builtin_vec_stl", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STL); diff --git a/gcc/config/rs6000/rs6000-internal.h b/gcc/config/rs6000/rs6000-internal.h index 9caef01..32681b6 100644 --- a/gcc/config/rs6000/rs6000-internal.h +++ b/gcc/config/rs6000/rs6000-internal.h @@ -32,7 +32,7 @@ typedef struct rs6000_stack { int cr_save_p; /* true if the CR reg needs to be saved */ unsigned int vrsave_mask; /* mask of vec 
registers to save */ int push_p; /* true if we need to allocate stack space */ - int calls_p; /* true if the function makes any calls */ + int calls_p; /* true if there are non-sibling calls */ int world_save_p; /* true if we're saving *everything*: r13-r31, cr, f14-f31, vrsave, v20-v31 */ enum rs6000_abi abi; /* which ABI to use */ diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index 0f88ec1..d90cd57 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -714,7 +714,7 @@ rs6000_stack_info (void) info->altivec_size = 16 * (LAST_ALTIVEC_REGNO + 1 - info->first_altivec_reg_save); - /* Does this function call anything? */ + /* Does this function call anything (apart from sibling calls)? */ info->calls_p = (!crtl->is_leaf || cfun->machine->ra_needs_full_frame); /* Determine if we need to save the condition code registers. */ @@ -5479,7 +5479,18 @@ rs6000_expand_split_stack_prologue (void) gcc_assert (flag_split_stack && reload_completed); if (!info->push_p) - return; + { + /* We need the -fsplit-stack prologue for functions that make + tail calls. Tail calls don't count against crtl->is_leaf. + Note that we are called inside a sequence. get_insns will + just return that (as yet empty) sequence, so instead we + access the function rtl with get_topmost_sequence. */ + for (insn = get_topmost_sequence ()->first; insn; insn = NEXT_INSN (insn)) + if (CALL_P (insn)) + break; + if (!insn) + return; + } if (global_regs[29]) { diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6f204ca..4d528a3 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -59,11 +59,12 @@ #include "gimplify.h" #include "gimple-fold.h" #include "gimple-iterator.h" -#include "gimple-ssa.h" #include "gimple-walk.h" +#include "ssa.h" +#include "tree-vectorizer.h" +#include "tree-ssa-propagate.h" #include "intl.h" #include "tm-constrs.h" -#include "tree-vectorizer.h" #include "target-globals.h" #include "builtins.h" #include "tree-vector-builder.h" @@ -75,9 +76,6 @@ #endif #include "case-cfn-macros.h" #include "ppc-auxv.h" -#include "tree-ssa-propagate.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" #include "rs6000-internal.h" #include "opts.h" @@ -3451,6 +3449,96 @@ rs6000_override_options_after_change (void) flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; } +#ifdef TARGET_USES_LINUX64_OPT +static void +rs6000_linux64_override_options () +{ + if (!global_options_set.x_rs6000_alignment_flags) + rs6000_alignment_flags = MASK_ALIGN_NATURAL; + if (rs6000_isa_flags & OPTION_MASK_64BIT) + { + if (DEFAULT_ABI != ABI_AIX) + { + rs6000_current_abi = ABI_AIX; + error (INVALID_64BIT, "call"); + } + dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); + if (ELFv2_ABI_CHECK) + { + rs6000_current_abi = ABI_ELFv2; + if (dot_symbols) + error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); + } + if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) + { + rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; + error (INVALID_64BIT, "relocatable"); + } + if (rs6000_isa_flags & OPTION_MASK_EABI) + { + rs6000_isa_flags &= ~OPTION_MASK_EABI; + error (INVALID_64BIT, "eabi"); + } + if (TARGET_PROTOTYPE) + { + target_prototype = 0; + error (INVALID_64BIT, "prototype"); + } + if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) + { + rs6000_isa_flags |= OPTION_MASK_POWERPC64; + error ("%<-m64%> requires a PowerPC64 cpu"); + } + if (!global_options_set.x_rs6000_current_cmodel) + SET_CMODEL (CMODEL_MEDIUM); + if 
((rs6000_isa_flags_explicit & OPTION_MASK_MINIMAL_TOC) != 0) + { + if (global_options_set.x_rs6000_current_cmodel + && rs6000_current_cmodel != CMODEL_SMALL) + error ("%<-mcmodel incompatible with other toc options%>"); + if (TARGET_MINIMAL_TOC) + SET_CMODEL (CMODEL_SMALL); + else if (TARGET_PCREL + || (PCREL_SUPPORTED_BY_OS + && (rs6000_isa_flags_explicit & OPTION_MASK_PCREL) == 0)) + /* Ignore -mno-minimal-toc. */ + ; + else + SET_CMODEL (CMODEL_SMALL); + } + if (rs6000_current_cmodel != CMODEL_SMALL) + { + if (!global_options_set.x_TARGET_NO_FP_IN_TOC) + TARGET_NO_FP_IN_TOC = rs6000_current_cmodel == CMODEL_MEDIUM; + if (!global_options_set.x_TARGET_NO_SUM_IN_TOC) + TARGET_NO_SUM_IN_TOC = 0; + } + if (TARGET_PLTSEQ && DEFAULT_ABI != ABI_ELFv2) + { + if (global_options_set.x_rs6000_pltseq) + warning (0, "%qs unsupported for this ABI", + "-mpltseq"); + rs6000_pltseq = false; + } + } + else if (TARGET_64BIT) + error (INVALID_32BIT, "32"); + else + { + if (TARGET_PROFILE_KERNEL) + { + profile_kernel = 0; + error (INVALID_32BIT, "profile-kernel"); + } + if (global_options_set.x_rs6000_current_cmodel) + { + SET_CMODEL (CMODEL_SMALL); + error (INVALID_32BIT, "cmodel"); + } + } +} +#endif + /* Override command line options. Combine build-specific configuration information with options @@ -4236,7 +4324,9 @@ rs6000_option_override_internal (bool global_init_p) } /* Enable Altivec ABI for AIX -maltivec. */ - if (TARGET_XCOFF && (TARGET_ALTIVEC || TARGET_VSX)) + if (TARGET_XCOFF + && (TARGET_ALTIVEC || TARGET_VSX) + && !global_options_set.x_rs6000_altivec_abi) { if (main_target_opt != NULL && !main_target_opt->x_rs6000_altivec_abi) error ("target attribute or pragma changes AltiVec ABI"); @@ -5731,7 +5821,7 @@ direct_return (void) /* Helper for num_insns_constant. Calculate number of instructions to load VALUE to a single gpr using combinations of addi, addis, ori, - oris and sldi instructions. */ + oris, sldi and rldimi instructions. */ static int num_insns_constant_gpr (HOST_WIDE_INT value) @@ -5759,7 +5849,7 @@ num_insns_constant_gpr (HOST_WIDE_INT value) high >>= 1; - if (low == 0) + if (low == 0 || low == high) return num_insns_constant_gpr (high) + 1; else if (high == 0) return num_insns_constant_gpr (low) + 1; @@ -8364,7 +8454,7 @@ rs6000_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, low_int = 0; high_int = INTVAL (XEXP (x, 1)) - low_int; sum = force_operand (gen_rtx_PLUS (Pmode, XEXP (x, 0), - GEN_INT (high_int)), 0); + gen_int_mode (high_int, Pmode)), 0); return plus_constant (Pmode, sum, low_int); } else if (GET_CODE (x) == PLUS @@ -9020,15 +9110,21 @@ rs6000_legitimate_address_p (machine_mode mode, rtx x, bool reg_ok_strict) bool reg_offset_p = reg_offset_addressing_ok_p (mode); bool quad_offset_p = mode_supports_dq_form (mode); - /* If this is an unaligned stvx/ldvx type address, discard the outer AND. */ + if (TARGET_ELF && RS6000_SYMBOL_REF_TLS_P (x)) + return 0; + + /* Handle unaligned altivec lvx/stvx type addresses. 
*/ if (VECTOR_MEM_ALTIVEC_OR_VSX_P (mode) && GET_CODE (x) == AND && CONST_INT_P (XEXP (x, 1)) && INTVAL (XEXP (x, 1)) == -16) - x = XEXP (x, 0); + { + x = XEXP (x, 0); + return (legitimate_indirect_address_p (x, reg_ok_strict) + || legitimate_indexed_address_p (x, reg_ok_strict) + || virtual_stack_registers_memory_p (x)); + } - if (TARGET_ELF && RS6000_SYMBOL_REF_TLS_P (x)) - return 0; if (legitimate_indirect_address_p (x, reg_ok_strict)) return 1; if (TARGET_UPDATE @@ -21176,9 +21272,9 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code, return true; } else if ((outer_code == PLUS - && reg_or_add_cint_operand (x, VOIDmode)) + && reg_or_add_cint_operand (x, mode)) || (outer_code == MINUS - && reg_or_sub_cint_operand (x, VOIDmode)) + && reg_or_sub_cint_operand (x, mode)) || ((outer_code == SET || outer_code == IOR || outer_code == XOR) @@ -26957,11 +27053,10 @@ rs6000_const_f32_to_i32 (rtx operand) void rs6000_emit_xxspltidp_v2df (rtx dst, long value) { - printf("rs6000_emit_xxspltidp_v2df called %ld\n", value); - printf("rs6000_emit_xxspltidp_v2df called 0x%lx\n", value); if (((value & 0x7F800000) == 0) && ((value & 0x7FFFFF) != 0)) inform (input_location, - "the result for the xxspltidp instruction is undefined for subnormal input values.\n"); + "the result for the xxspltidp instruction " + "is undefined for subnormal input values"); emit_insn( gen_xxspltidp_v2df_inst (dst, GEN_INT (value))); } diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 694ff70..dc06014 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -11554,7 +11554,7 @@ "" { /* Everything is best done with setbc[r] if available. */ - if (TARGET_POWER10) + if (TARGET_POWER10 && TARGET_ISEL) rs6000_emit_int_cmove (operands[0], operands[1], const1_rtx, const0_rtx); /* Expanding EQ and NE directly to some machine instructions does not help @@ -12697,12 +12697,7 @@ "" { if (rs6000_speculate_indirect_jumps) - { - if (TARGET_32BIT) - emit_jump_insn (gen_tablejumpsi (operands[0], operands[1])); - else - emit_jump_insn (gen_tablejumpdi (operands[0], operands[1])); - } + emit_jump_insn (gen_tablejump_normal (Pmode, operands[0], operands[1])); else { rtx ccreg = gen_reg_rtx (CCmode); @@ -12716,69 +12711,57 @@ DONE; }) -(define_expand "tablejumpsi" - [(set (match_dup 3) - (plus:SI (match_operand:SI 0) - (match_dup 2))) - (parallel [(set (pc) - (match_dup 3)) - (use (label_ref (match_operand 1)))])] - "TARGET_32BIT && rs6000_speculate_indirect_jumps" +(define_expand "@tablejump<mode>_normal" + [(use (match_operand:SI 0)) + (use (match_operand:P 1))] + "rs6000_speculate_indirect_jumps" { + rtx off; operands[0] = force_reg (SImode, operands[0]); - operands[2] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1])); - operands[3] = gen_reg_rtx (SImode); + if (<MODE>mode == SImode) + off = operands[0]; + else + { + off = gen_reg_rtx (Pmode); + rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]); + emit_move_insn (off, src); + } + + rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1])); + rtx addr = gen_reg_rtx (Pmode); + + emit_insn (gen_add<mode>3 (addr, off, lab)); + emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1])); + DONE; }) -(define_expand "tablejumpsi_nospec" - [(set (match_dup 4) - (plus:SI (match_operand:SI 0) - (match_dup 3))) - (parallel [(set (pc) - (match_dup 4)) - (use (label_ref (match_operand 1))) - (clobber (match_operand 2))])] - "TARGET_32BIT && !rs6000_speculate_indirect_jumps" +(define_expand 
"@tablejump<mode>_nospec" + [(use (match_operand:SI 0)) + (use (match_operand:P 1)) + (use (match_operand:CC 2))] + "!rs6000_speculate_indirect_jumps" { + rtx off; operands[0] = force_reg (SImode, operands[0]); - operands[3] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1])); - operands[4] = gen_reg_rtx (SImode); -}) + if (<MODE>mode == SImode) + off = operands[0]; + else + { + off = gen_reg_rtx (Pmode); + rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]); + emit_move_insn (off, src); + } -(define_expand "tablejumpdi" - [(set (match_dup 4) - (sign_extend:DI (match_operand:SI 0 "lwa_operand"))) - (set (match_dup 3) - (plus:DI (match_dup 4) - (match_dup 2))) - (parallel [(set (pc) - (match_dup 3)) - (use (label_ref (match_operand 1)))])] - "TARGET_64BIT && rs6000_speculate_indirect_jumps" -{ - operands[2] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1])); - operands[3] = gen_reg_rtx (DImode); - operands[4] = gen_reg_rtx (DImode); -}) + rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1])); + rtx addr = gen_reg_rtx (Pmode); -(define_expand "tablejumpdi_nospec" - [(set (match_dup 5) - (sign_extend:DI (match_operand:SI 0 "lwa_operand"))) - (set (match_dup 4) - (plus:DI (match_dup 5) - (match_dup 3))) - (parallel [(set (pc) - (match_dup 4)) - (use (label_ref (match_operand 1))) - (clobber (match_operand 2))])] - "TARGET_64BIT && !rs6000_speculate_indirect_jumps" -{ - operands[3] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1])); - operands[4] = gen_reg_rtx (DImode); - operands[5] = gen_reg_rtx (DImode); + emit_insn (gen_add<mode>3 (addr, off, lab)); + emit_jump_insn (gen_tablejump_insn_nospec (Pmode, addr, operands[1], + operands[2])); + DONE; }) -(define_insn "*tablejump<mode>_internal1" +(define_insn "@tablejump<mode>_insn_normal" [(set (pc) (match_operand:P 0 "register_operand" "c,*l")) (use (label_ref (match_operand 1)))] @@ -12786,7 +12769,7 @@ "b%T0" [(set_attr "type" "jmpreg")]) -(define_insn "*tablejump<mode>_internal1_nospec" +(define_insn "@tablejump<mode>_insn_nospec" [(set (pc) (match_operand:P 0 "register_operand" "c,*l")) (use (label_ref (match_operand 1))) diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index d78ddba..4c0fc86 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -42,6 +42,36 @@ #include <altivec.h> #include <tmmintrin.h> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_extract_epi8 (__m128i __X, const int __N) { diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 4ff5245..d6347db 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -352,6 +352,8 @@ UNSPEC_VSX_FIRST_MISMATCH_EOS_INDEX UNSPEC_XXGENPCV UNSPEC_MTVSBM 
+ UNSPEC_EXTENDDITI2 + UNSPEC_MTVSRD_DITI_W1 UNSPEC_VCNTMB UNSPEC_VEXPAND UNSPEC_VEXTRACT @@ -1253,6 +1255,24 @@ } }) +;; Load rightmost element from load_data +;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx. +(define_insn "vsx_lxvr<wd>x" + [(set (match_operand:TI 0 "vsx_register_operand" "=wa") + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "Z")))] + "TARGET_POWER10" + "lxvr<wd>x %x0,%y1" + [(set_attr "type" "vecload")]) + +;; Store rightmost element into store_data +;; using stxvrbx, stxvrhx, strvxwx, strvxdx. +(define_insn "vsx_stxvr<wd>x" + [(set (match_operand:INT_ISA3 0 "memory_operand" "=Z") + (truncate:INT_ISA3 (match_operand:TI 1 "vsx_register_operand" "wa")))] + "TARGET_POWER10" + "stxvr<wd>x %x1,%y0" + [(set_attr "type" "vecstore")]) + ;; Explicit load/store expanders for the builtin functions for lxvd2x, etc., ;; when you really want their element-reversing behavior. (define_insn "vsx_ld_elemrev_v2di" @@ -4795,6 +4815,37 @@ "vextsw2d %0,%1" [(set_attr "type" "vecexts")]) +;; ISA 3.1 vector sign extend +;; Move DI value from GPR to TI mode in VSX register, word 1. +(define_insn "mtvsrdd_diti_w1" + [(set (match_operand:TI 0 "register_operand" "=wa") + (unspec:TI [(match_operand:DI 1 "register_operand" "r")] + UNSPEC_MTVSRD_DITI_W1))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrdd %x0,0,%1" + [(set_attr "type" "vecmove")]) + +;; Sign extend 64-bit value in TI reg, word 1, to 128-bit value in TI reg +(define_insn "extendditi2_vector" + [(set (match_operand:TI 0 "gpc_reg_operand" "=v") + (unspec:TI [(match_operand:TI 1 "gpc_reg_operand" "v")] + UNSPEC_EXTENDDITI2))] + "TARGET_POWER10" + "vextsd2q %0,%1" + [(set_attr "type" "vecexts")]) + +(define_expand "extendditi2" + [(set (match_operand:TI 0 "gpc_reg_operand") + (sign_extend:DI (match_operand:DI 1 "gpc_reg_operand")))] + "TARGET_POWER10" + { + /* Move 64-bit src from GPR to vector reg and sign extend to 128-bits. 
*/ + rtx temp = gen_reg_rtx (TImode); + emit_insn (gen_mtvsrdd_diti_w1 (temp, operands[1])); + emit_insn (gen_extendditi2_vector (operands[0], temp)); + DONE; + }) + ;; ISA 3.0 Binary Floating-Point Support @@ -5659,7 +5710,7 @@ { int i; int vals_le[16] = {15, 14, 0, 0, 13, 12, 0, 0, 11, 10, 0, 0, 9, 8, 0, 0}; - int vals_be[16] = {7, 6, 0, 0, 5, 4, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0}; + int vals_be[16] = {0, 0, 0, 1, 0, 0, 2, 3, 0, 0, 4, 5, 0, 0, 6, 7}; rtx rvals[16]; rtx mask = gen_reg_rtx (V16QImode); @@ -5693,7 +5744,7 @@ "TARGET_P9_VECTOR" { int vals_le[16] = {7, 6, 0, 0, 5, 4, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0}; - int vals_be[16] = {15, 14, 0, 0, 13, 12, 0, 0, 11, 10, 0, 0, 9, 8, 0, 0}; + int vals_be[16] = {0, 0, 8, 9, 0, 0, 10, 11, 0, 0, 12, 13, 0, 0, 14, 15}; int i; rtx rvals[16]; @@ -6035,7 +6086,7 @@ (match_operand:QI 2 "const_0_to_1_operand" "n")] UNSPEC_VCNTMB))] "TARGET_POWER10" - "vcntmb<VSX_MM_SUFFIX> %0,%1,%2" + "vcntmb<wd> %0,%1,%2" [(set_attr "type" "vecsimple")]) (define_insn "vec_extract_<mode>" @@ -6043,7 +6094,7 @@ (unspec:SI [(match_operand:VSX_MM 1 "altivec_register_operand" "v")] UNSPEC_VEXTRACT))] "TARGET_POWER10" - "vextract<VSX_MM_SUFFIX>m %0,%1" + "vextract<wd>m %0,%1" [(set_attr "type" "vecsimple")]) (define_insn "vec_expand_<mode>" @@ -6051,5 +6102,5 @@ (unspec:VSX_MM [(match_operand:VSX_MM 1 "vsx_register_operand" "v")] UNSPEC_VEXPAND))] "TARGET_POWER10" - "vexpand<VSX_MM_SUFFIX>m %0,%1" + "vexpand<wd>m %0,%1" [(set_attr "type" "vecsimple")]) diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h index 771dddf..87ca3af 100644 --- a/gcc/config/rs6000/vxworks.h +++ b/gcc/config/rs6000/vxworks.h @@ -18,10 +18,21 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -/* Note to future editors: VxWorks is mostly an EABI target. We do - not use rs6000/eabi.h because we would have to override most of - it anyway. However, if you change that file, consider making - analogous changes here too. */ +/* The port comes in two very different flavors at this stage: + + - For 653 (AE) and regular versions prior to VxWorks 7, the port + comes with its own set of definitions, matching a system compiler + configured this way as well as the corresponding run-time + environment. This is essentially an eabi system, so changes to + eabi.h should usually be reflected here. + + - Starting with VxWorks 7 (post SR600), the system environment + was made extremely similar to GNU/Linux and this toolchain is + built on top of the corresponding header files. */ + +/*-------------------------------------------------------------*/ +/* Common definitions first. */ +/*-------------------------------------------------------------*/ /* CPP predefined macros. */ @@ -29,111 +40,156 @@ along with GCC; see the file COPYING3. If not see #define TARGET_OS_CPP_BUILTINS() \ do \ { \ - builtin_define ("__ppc"); \ - builtin_define ("__PPC__"); \ - builtin_define ("__EABI__"); \ builtin_define ("__ELF__"); \ + if (!TARGET_VXWORKS7) \ + builtin_define ("__EABI__"); \ + \ + /* CPU macros, based on what the system compilers do. */ \ + if (!TARGET_VXWORKS7) \ + { \ + builtin_define ("__ppc"); \ + /* Namespace violation below, but the system headers \ + really depend heavily on this. */ \ + builtin_define ("CPU_FAMILY=PPC"); \ + \ + /* __PPC__ isn't actually emitted by the system compiler \ + prior to vx7 but has been advertised by us for ages. 
*/ \ + builtin_define ("__PPC__"); \ + } \ + else \ + { \ + builtin_define ("__PPC__"); \ + builtin_define ("__powerpc__"); \ + if (TARGET_64BIT) \ + { \ + builtin_define ("__PPC64__"); \ + builtin_define ("__powerpc64__"); \ + } \ + else \ + { \ + builtin_define ("__PPC"); \ + builtin_define ("__powerpc"); \ + } \ + } \ + \ + /* Asserts for #cpu and #machine. */ \ + if (TARGET_64BIT) \ + { \ + builtin_assert ("cpu=powerpc64"); \ + builtin_assert ("machine=powerpc64"); \ + } \ + else \ + { \ + builtin_assert ("cpu=powerpc"); \ + builtin_assert ("machine=powerpc"); \ + } \ + \ + /* PowerPC VxWorks specificities. */ \ if (!TARGET_SOFT_FLOAT) \ - builtin_define ("__hardfp"); \ + { \ + builtin_define ("__hardfp"); \ + builtin_define ("_WRS_HARDWARE_FP"); \ + } \ \ - /* C89 namespace violation! */ \ - builtin_define ("CPU_FAMILY=PPC"); \ - \ + /* Common VxWorks and port items. */ \ VXWORKS_OS_CPP_BUILTINS (); \ + TARGET_OS_SYSV_CPP_BUILTINS (); \ } \ while (0) -/* vx6 library path. */ -#if !TARGET_VXWORKS7 -#undef STARTFILE_PREFIX_SPEC -#define STARTFILE_PREFIX_SPEC \ - "%{mrtp:%{!shared:%:getenv(WIND_BASE /target/lib/usr/lib/ppc/PPC32/common)}}" +/* Specific CPU macro definitions expected by the system headers, + inferred from -mcpu requests by the user. Different versions of + VxWorks expect different forms of macros, such as + + -D_VX_CPU=_VX_PPC403 on Vx7 and some variants of Vx6, + -DCPU=PPC403 on all Vx6 and earlier. */ + +#if TARGET_VXWORKS7 +#define VX_CPU_PREFIX "_VX_" +#else +#define VX_CPU_PREFIX "" #endif -/* Only big endian PPC is supported by VxWorks. */ -#undef BYTES_BIG_ENDIAN -#define BYTES_BIG_ENDIAN 1 -#undef WORDS_BIG_ENDIAN -#define WORDS_BIG_ENDIAN 1 +#define VX_CPUDEF(CPUID) \ + ":-D" VX_CPU_PREFIX "CPU=" VX_CPU_PREFIX #CPUID -/* We have to kill off the entire specs set created by rs6000/sysv4.h - and substitute our own set. The top level vxworks.h has done some - of this for us. */ +#define VX_MCPU(CPU,CPUID) \ + "mcpu=" #CPU VX_CPUDEF(CPUID) -#undef SUBTARGET_EXTRA_SPECS #undef CPP_SPEC -#undef CC1_SPEC -#undef ASM_SPEC - -#define SUBTARGET_EXTRA_SPECS /* none needed */ +#define CPP_SPEC \ + "%{!D" VX_CPU_PREFIX "CPU=*:%{" \ + VX_MCPU(403, PPC403) ";" \ + VX_MCPU(405, PPC405) ";" \ + VX_MCPU(440, PPC440) ";" \ + VX_MCPU(464, PPC464) ";" \ + VX_MCPU(476, PPC476) ";" \ + VX_MCPU(603, PPC603) ";" \ + VX_MCPU(604, PPC604) ";" \ + VX_MCPU(860, PPC860) ";" \ + VX_MCPU(e6500, E6500) ";" \ + VX_MCPU(8540, PPC85XX) ";" \ + VX_MCPU(8548, PPC85XX) ";" \ + VX_CPUDEF(PPC604) \ + "}}" \ + VXWORKS_ADDITIONAL_CPP_SPEC /* FIXME: The only reason we allow no -mcpu switch at all is because - config-ml.in insists on a "." multilib. */ -#define CPP_SPEC \ -"%{!DCPU=*: \ - %{mcpu=403 : -DCPU=PPC403 ; \ - mcpu=405 : -DCPU=PPC405 ; \ - mcpu=440 : -DCPU=PPC440 ; \ - mcpu=464 : -DCPU=PPC464 ; \ - mcpu=476 : -DCPU=PPC476 ; \ - mcpu=603 : -DCPU=PPC603 ; \ - mcpu=604 : -DCPU=PPC604 ; \ - mcpu=860 : -DCPU=PPC860 ; \ - mcpu=8540: -DCPU=PPC85XX ; \ - mcpu=8548: -DCPU=PPC85XX ; \ - : -DCPU=PPC604 }}" \ -VXWORKS_ADDITIONAL_CPP_SPEC - -#define CC1_SPEC \ -"%{G*} %{mno-sdata:-msdata=none} %{msdata:-msdata=default} \ - %{mlittle|mlittle-endian:-mstrict-align}" - -#define ASM_SPEC \ -"%(asm_cpu) \ - %{,assembler|,assembler-with-cpp: %{mregnames} %{mno-regnames}} \ - %{mrelocatable} %{mrelocatable-lib} %{" FPIC_SPEC ":-K PIC} -mbig" + config-ml.in insists on a "." multilib. */ #undef LIB_SPEC #define LIB_SPEC VXWORKS_LIB_SPEC -/* For RTPs, leverage linker relaxation. 
This helps programs referring - to, typically, kernel services too far away for short calls. This is more - precise than -mlongcall and can be overriden with -Wl,--no-relax. */ -#define VXWORKS_RELAX_LINK_SPEC "%{mrtp:--relax}" - -#undef LINK_SPEC -#define LINK_SPEC VXWORKS_LINK_SPEC " " VXWORKS_RELAX_LINK_SPEC - #undef STARTFILE_SPEC #define STARTFILE_SPEC VXWORKS_STARTFILE_SPEC + #undef ENDFILE_SPEC #define ENDFILE_SPEC VXWORKS_ENDFILE_SPEC /* There is no default multilib. */ #undef MULTILIB_DEFAULTS -#undef TARGET_DEFAULT -#define TARGET_DEFAULT (MASK_EABI | MASK_STRICT_ALIGN) +/* No _mcount profiling on VxWorks. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO) -#undef PROCESSOR_DEFAULT -#define PROCESSOR_DEFAULT PROCESSOR_PPC604 +/* Initialize library function table. */ +#undef TARGET_INIT_LIBFUNCS +#define TARGET_INIT_LIBFUNCS rs6000_vxworks_init_libfuncs /* Nor sdata, for kernel mode. We use this in SUBSUBTARGET_INITIALIZE_OPTIONS, after rs6000_rtp has been initialized. */ #undef SDATA_DEFAULT_SIZE #define SDATA_DEFAULT_SIZE (TARGET_VXWORKS_RTP ? 8 : 0) -/* Enforce 16-byte alignment for the stack pointer, to permit general - compliance with e.g. Altivec instructions requirements. Make sure - this isn't overruled by the EABI constraints. */ +#undef SUB3TARGET_OVERRIDE_OPTIONS +#define SUB3TARGET_OVERRIDE_OPTIONS \ + do { \ + if (!global_options_set.x_g_switch_value) \ + g_switch_value = SDATA_DEFAULT_SIZE; \ + VXWORKS_OVERRIDE_OPTIONS; \ + } while (0) -#undef STACK_BOUNDARY -#define STACK_BOUNDARY (16*BITS_PER_UNIT) +/* The stack pointer need not be moved while checking the stack. */ +#undef STACK_CHECK_MOVING_SP -#undef PREFERRED_STACK_BOUNDARY -#define PREFERRED_STACK_BOUNDARY STACK_BOUNDARY +/* Define this to be nonzero if static stack checking is supported. */ +#define STACK_CHECK_STATIC_BUILTIN 1 -#undef ABI_STACK_BOUNDARY +/* Room needed to allow exception propagation, from what experiments + and low level observations taught us ... */ +#define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024) + +/* Leverage linker relaxation for RTPs. This helps 32bit programs + referring to kernel services too far away for short calls, is more + precise than -mlongcall and can be overriden with -Wl,--no-relax. */ +#define VXWORKS_RELAX_LINK_SPEC "%{mrtp:--relax}" + +/*-------------------------------------------------------------*/ +/* Pre-VxWorks7 configuration. */ +/*-------------------------------------------------------------*/ + +#if !TARGET_VXWORKS7 #undef RS6000_STARTING_FRAME_OFFSET #define RS6000_STARTING_FRAME_OFFSET \ @@ -146,21 +202,79 @@ VXWORKS_ADDITIONAL_CPP_SPEC RS6000_ALIGN (crtl->outgoing_args_size.to_constant () \ + STACK_POINTER_OFFSET, 16) -#undef SUBSUBTARGET_OVERRIDE_OPTIONS -#define SUBSUBTARGET_OVERRIDE_OPTIONS \ - do { \ - if (!global_options_set.x_g_switch_value) \ - g_switch_value = SDATA_DEFAULT_SIZE; \ - VXWORKS_OVERRIDE_OPTIONS; \ - } while (0) +/* Enforce 16-byte alignment for the stack pointer, to permit general + compliance with e.g. Altivec instructions requirements. Make sure + this isn't overruled by the EABI constraints. */ -/* No _mcount profiling on VxWorks. */ -#undef FUNCTION_PROFILER -#define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO) +#undef STACK_BOUNDARY +#define STACK_BOUNDARY (16*BITS_PER_UNIT) -/* Define this to be nonzero if static stack checking is supported. 
*/ -#define STACK_CHECK_STATIC_BUILTIN 1 +#undef PREFERRED_STACK_BOUNDARY +#define PREFERRED_STACK_BOUNDARY STACK_BOUNDARY + +#undef ABI_STACK_BOUNDARY + +#undef STARTFILE_PREFIX_SPEC +#define STARTFILE_PREFIX_SPEC \ + "%{mrtp:%{!shared:%:getenv(WIND_BASE /target/lib/usr/lib/ppc/PPC32/common)}}" + +/* For aggregates passing, use the same, consistent ABI as Linux. */ +#define AGGREGATE_PADDING_FIXED 0 +#define AGGREGATES_PAD_UPWARD_ALWAYS 0 + +#undef ASM_SPEC +#define ASM_SPEC \ +"%(asm_cpu) \ + %{,assembler|,assembler-with-cpp: %{mregnames} %{mno-regnames}} \ + %{mrelocatable} %{mrelocatable-lib} %{" FPIC_SPEC ":-K PIC} -mbig" + +#undef CC1_SPEC +#define CC1_SPEC VXWORKS_CC1_SPEC " \ + %{G*} %{mno-sdata:-msdata=none} %{msdata:-msdata=default} \ + %{mlittle|mlittle-endian:-mstrict-align}" + +#undef LINK_SPEC +#define LINK_SPEC VXWORKS_LINK_SPEC " " VXWORKS_RELAX_LINK_SPEC + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_EABI | MASK_STRICT_ALIGN) + +#undef PROCESSOR_DEFAULT +#define PROCESSOR_DEFAULT PROCESSOR_PPC604 + +/* Only big endian PPC is supported by VxWorks. */ +#undef BYTES_BIG_ENDIAN +#define BYTES_BIG_ENDIAN 1 + +#undef WORDS_BIG_ENDIAN +#define WORDS_BIG_ENDIAN 1 + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS /* none needed */ + +#else /* TARGET_VXWORKS7 */ + +/*-------------------------------------------------------------*/ +/* Post-VxWorks7 (SR600) configuration. */ +/*-------------------------------------------------------------*/ + +/* VxWorks does not use local symbols for the function entry point. */ +#undef DOT_SYMBOLS +#define DOT_SYMBOLS 0 + +#undef LINK_OS_VXWORKS_SPEC +#define LINK_OS_VXWORKS_SPEC \ + " %{!mrtp:-r} %{mrtp:-q -static} %{!Xbind-lazy:-z now}" + +#undef LINK_OS_EXTRA_SPEC32 +#define LINK_OS_EXTRA_SPEC32 LINK_OS_VXWORKS_SPEC " " VXWORKS_RELAX_LINK_SPEC + +#undef LINK_OS_EXTRA_SPEC64 +#define LINK_OS_EXTRA_SPEC64 LINK_OS_VXWORKS_SPEC + +/* linux64.h enables this, not supported in vxWorks. */ +#undef TARGET_FLOAT128_ENABLE_TYPE +#define TARGET_FLOAT128_ENABLE_TYPE 0 + +#endif /* TARGET_VXWORKS7 */ -/* This platform supports the probing method of stack checking (RTP mode). - 8K is reserved in the stack to propagate exceptions in case of overflow. */ -#define STACK_CHECK_PROTECT 8192 diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 6f1bc07..029f728 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -121,6 +121,7 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); +extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); extern rtx_insn *s390_emit_call (rtx, rtx, rtx, rtx); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index c762840..f9b27f9 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -2467,6 +2467,9 @@ s390_contiguous_bitmask_vector_p (rtx op, int *start, int *end) rtx elt; bool b; + /* Handle floats by bitcasting them to ints. 
*/ + op = gen_lowpart (related_int_vector_mode (GET_MODE (op)).require (), op); + gcc_assert (!!start == !!end); if (!const_vec_duplicate_p (op, &elt) || !CONST_INT_P (elt)) @@ -5952,6 +5955,7 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx alignment) rtx temp; rtx len = gen_reg_rtx (QImode); rtx cond; + rtx mem; s390_load_address (str_addr_base_reg, XEXP (string, 0)); emit_move_insn (str_idx_reg, const0_rtx); @@ -5993,10 +5997,10 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx alignment) LABEL_NUSES (loop_start_label) = 1; /* Load 16 bytes of the string into VR. */ - emit_move_insn (str_reg, - gen_rtx_MEM (V16QImode, - gen_rtx_PLUS (Pmode, str_idx_reg, - str_addr_base_reg))); + mem = gen_rtx_MEM (V16QImode, + gen_rtx_PLUS (Pmode, str_idx_reg, str_addr_base_reg)); + set_mem_align (mem, 128); + emit_move_insn (str_reg, mem); if (into_loop_label != NULL_RTX) { emit_label (into_loop_label); @@ -6863,15 +6867,16 @@ s390_expand_vec_init (rtx target, rtx vals) } /* Use vector gen mask or vector gen byte mask if possible. */ - if (all_same && all_const_int - && (XVECEXP (vals, 0, 0) == const0_rtx - || s390_contiguous_bitmask_vector_p (XVECEXP (vals, 0, 0), - NULL, NULL) - || s390_bytemask_vector_p (XVECEXP (vals, 0, 0), NULL))) + if (all_same && all_const_int) { - emit_insn (gen_rtx_SET (target, - gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)))); - return; + rtx vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); + if (XVECEXP (vals, 0, 0) == const0_rtx + || s390_contiguous_bitmask_vector_p (vec, NULL, NULL) + || s390_bytemask_vector_p (vec, NULL)) + { + emit_insn (gen_rtx_SET (target, vec)); + return; + } } /* Use vector replicate instructions. vlrep/vrepi/vrep */ @@ -6949,6 +6954,30 @@ s390_expand_vec_init (rtx target, rtx vals) } } +/* Emit a vector constant that contains 1s in each element's sign bit position + and 0s in other positions. MODE is the desired constant's mode. */ +extern rtx +s390_build_signbit_mask (machine_mode mode) +{ + /* Generate the integral element mask value. */ + machine_mode inner_mode = GET_MODE_INNER (mode); + int inner_bitsize = GET_MODE_BITSIZE (inner_mode); + wide_int mask_val = wi::set_bit_in_zero (inner_bitsize - 1, inner_bitsize); + + /* Emit the element mask rtx. Use gen_lowpart in order to cast the integral + value to the desired mode. */ + machine_mode int_mode = related_int_vector_mode (mode).require (); + rtx mask = immed_wide_int_const (mask_val, GET_MODE_INNER (int_mode)); + mask = gen_lowpart (inner_mode, mask); + + /* Emit the vector mask rtx by mode the element mask rtx. */ + int nunits = GET_MODE_NUNITS (mode); + rtvec v = rtvec_alloc (nunits); + for (int i = 0; i < nunits; i++) + RTVEC_ELT (v, i) = mask; + return gen_rtx_CONST_VECTOR (mode, v); +} + /* Structure to hold the initial parameters for a compare_and_swap operation in HImode and QImode. 
*/ @@ -16082,12 +16111,13 @@ s390_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) fenv_var = __builtin_s390_efpc (); __builtin_s390_sfpc (fenv_var & mask) */ - tree old_fpc = build2 (MODIFY_EXPR, unsigned_type_node, fenv_var, call_efpc); - tree new_fpc = - build2 (BIT_AND_EXPR, unsigned_type_node, fenv_var, - build_int_cst (unsigned_type_node, - ~(FPC_DXC_MASK | FPC_FLAGS_MASK | - FPC_EXCEPTION_MASK))); + tree old_fpc = build4 (TARGET_EXPR, unsigned_type_node, fenv_var, call_efpc, + NULL_TREE, NULL_TREE); + tree new_fpc + = build2 (BIT_AND_EXPR, unsigned_type_node, fenv_var, + build_int_cst (unsigned_type_node, + ~(FPC_DXC_MASK | FPC_FLAGS_MASK + | FPC_EXCEPTION_MASK))); tree set_new_fpc = build_call_expr (sfpc, 1, new_fpc); *hold = build2 (COMPOUND_EXPR, void_type_node, old_fpc, set_new_fpc); @@ -16106,8 +16136,8 @@ s390_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) __atomic_feraiseexcept ((old_fpc & FPC_FLAGS_MASK) >> FPC_FLAGS_SHIFT); */ old_fpc = create_tmp_var_raw (unsigned_type_node); - tree store_old_fpc = build2 (MODIFY_EXPR, void_type_node, - old_fpc, call_efpc); + tree store_old_fpc = build4 (TARGET_EXPR, void_type_node, old_fpc, call_efpc, + NULL_TREE, NULL_TREE); set_new_fpc = build_call_expr (sfpc, 1, fenv_var); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 4c3e540..18edea1 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -1391,23 +1391,55 @@ ; (TF|DF|SF|TD|DD|SD) instructions -; FIXME: load and test instructions turn SNaN into QNaN what is not -; acceptable if the target will be used afterwards. On the other hand -; they are quite convenient for implementing comparisons with 0.0. So -; try to enable them via splitter/peephole if the value isn't needed anymore. -; See testcases: load-and-test-fp-1.c and load-and-test-fp-2.c +; load and test instructions turn a signaling NaN into a quiet NaN. Thus they +; may only be used if the target register is dead afterwards or if fast math +; is enabled. The former is done via a peephole optimization. Note, load and +; test instructions may only be used for (in)equality comparisons because +; relational comparisons must treat a quiet NaN like a signaling NaN which is +; not the case for load and test instructions. For fast math insn +; "cmp<mode>_ccs_0_fastmath" applies. 
+; See testcases load-and-test-fp-{1,2}.c + +(define_peephole2 + [(set (match_operand:FP 0 "register_operand") + (match_operand:FP 1 "const0_operand")) + (set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_operand:FP 2 "register_operand") + (match_operand:FP 3 "register_operand")))] + "TARGET_HARD_FLOAT + && FP_REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[3]) + && peep2_reg_dead_p (2, operands[0]) + && peep2_reg_dead_p (2, operands[2])" + [(parallel + [(set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_dup 2) (match_dup 1))) + (clobber (match_dup 2))])] + "") ; ltxbr, ltdbr, ltebr, ltxtr, ltdtr -(define_insn "*cmp<mode>_ccs_0" - [(set (reg CC_REGNUM) - (compare (match_operand:FP 0 "register_operand" "f") - (match_operand:FP 1 "const0_operand" ""))) - (clobber (match_operand:FP 2 "register_operand" "=0"))] - "s390_match_ccmode(insn, CCSmode) && TARGET_HARD_FLOAT" +(define_insn "*cmp<mode>_ccz_0" + [(set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_operand:FP 0 "register_operand" "f") + (match_operand:FP 1 "const0_operand"))) + (clobber (match_operand:FP 2 "register_operand" "=0"))] + "TARGET_HARD_FLOAT" "lt<xde><bt>r\t%0,%0" [(set_attr "op_type" "RRE") (set_attr "type" "fsimp<mode>")]) +(define_insn "*cmp<mode>_ccs_0_fastmath" + [(set (reg CC_REGNUM) + (compare (match_operand:FP 0 "register_operand" "f") + (match_operand:FP 1 "const0_operand")))] + "s390_match_ccmode (insn, CCSmode) + && TARGET_HARD_FLOAT + && !flag_trapping_math + && !flag_signaling_nans" + "lt<xde><bt>r\t%0,%0" + [(set_attr "op_type" "RRE") + (set_attr "type" "fsimp<mode>")]) + ; VX: TFmode in FPR pairs: use cxbr instead of wfcxb ; cxtr, cdtr, cxbr, cdbr, cebr, cdb, ceb, wfcsb, wfcdb (define_insn "*cmp<mode>_ccs" diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 2573b7d..3c01cd1 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -1425,35 +1425,45 @@ ; Vector copysign, implement using vector select (define_expand "copysign<mode>3" - [(set (match_operand:VFT 0 "register_operand" "") - (if_then_else:VFT - (eq (match_dup 3) - (match_dup 4)) - (match_operand:VFT 1 "register_operand" "") - (match_operand:VFT 2 "register_operand" "")))] + [(set (match_operand:VFT 0 "register_operand" "") + (ior:VFT + (and:VFT (match_operand:VFT 2 "register_operand" "") + (match_dup 3)) + (and:VFT (not:VFT (match_dup 3)) + (match_operand:VFT 1 "register_operand" ""))))] "TARGET_VX" { - int sz = GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode)); - int prec = GET_MODE_PRECISION (GET_MODE_INNER (<tointvec>mode)); - wide_int mask_val = wi::shwi (1l << (sz - 1), prec); - - rtx mask = gen_reg_rtx (<tointvec>mode); - - int nunits = GET_MODE_NUNITS (<tointvec>mode); - rtvec v = rtvec_alloc (nunits); - for (int i = 0; i < nunits; i++) - RTVEC_ELT (v, i) = GEN_INT (mask_val.to_shwi ()); - - mask = gen_rtx_CONST_VECTOR (<tointvec>mode, v); - operands[3] = force_reg (<tointvec>mode, mask); - operands[4] = CONST0_RTX (<tointvec>mode); + rtx mask = s390_build_signbit_mask (<MODE>mode); + operands[3] = force_reg (<MODE>mode, mask); }) ;; ;; Integer compares ;; -(define_insn "*vec_cmp<VICMP_HW_OP:code><VI:mode>_nocc" +(define_expand "vec_cmp<VI_HW:mode><VI_HW:mode>" + [(set (match_operand:VI_HW 0 "register_operand" "") + (match_operator:VI_HW 1 "" + [(match_operand:VI_HW 2 "register_operand" "") + (match_operand:VI_HW 3 "register_operand" "")]))] + "TARGET_VX" +{ + s390_expand_vec_compare (operands[0], GET_CODE(operands[1]), operands[2], operands[3]); + DONE; +}) + +(define_expand 
"vec_cmpu<VI_HW:mode><VI_HW:mode>" + [(set (match_operand:VI_HW 0 "register_operand" "") + (match_operator:VI_HW 1 "" + [(match_operand:VI_HW 2 "register_operand" "") + (match_operand:VI_HW 3 "register_operand" "")]))] + "TARGET_VX" +{ + s390_expand_vec_compare (operands[0], GET_CODE(operands[1]), operands[2], operands[3]); + DONE; +}) + +(define_insn "*vec_cmp<VICMP_HW_OP:code><VI:mode><VI:mode>_nocc" [(set (match_operand:VI 2 "register_operand" "=v") (VICMP_HW_OP:VI (match_operand:VI 0 "register_operand" "v") (match_operand:VI 1 "register_operand" "v")))] diff --git a/gcc/config/t-vxworks b/gcc/config/t-vxworks index fd1fbfd..221f53c 100644 --- a/gcc/config/t-vxworks +++ b/gcc/config/t-vxworks @@ -59,7 +59,7 @@ stmp-int-hdrs: subst-glimits.h subst-%.h: cp -p $(srcdir)/$*.h orig-$*.h ID=$$(echo $(BASEVER_c) | sed -e 's/\./_/g'); \ - sed -e "s/_LIMITS_H__/_LIMITS_H_$${ID}_/" < $(srcdir)/$*.h > $@ + sed -e "s/_LIMITS_H__/_LIMITS_H__$${ID}_/" < $(srcdir)/$*.h > $@ cp $@ $(srcdir)/$*.h # Then arrange to restore the original versions after the standard diff --git a/gcc/config/vx-common.h b/gcc/config/vx-common.h index f4a1ffd..9cd7b3d 100644 --- a/gcc/config/vx-common.h +++ b/gcc/config/vx-common.h @@ -23,8 +23,6 @@ along with GCC; see the file COPYING3. If not see /* Most of these will probably be overridden by subsequent headers. We undefine them here just in case, and define VXWORKS_ versions of each, to be used in port-specific vxworks.h. */ -#undef LIB_SPEC -#undef LINK_SPEC #undef LIBGCC_SPEC #define LIBGCC_SPEC VXWORKS_LIBGCC_SPEC #undef STARTFILE_SPEC diff --git a/gcc/config/vxworks.c b/gcc/config/vxworks.c index 970d504..ca0f5de 100644 --- a/gcc/config/vxworks.c +++ b/gcc/config/vxworks.c @@ -154,8 +154,10 @@ vxworks_override_options (void) targetm.have_ctors_dtors = TARGET_VXWORKS_HAVE_CTORS_DTORS || HAVE_INITFINI_ARRAY_SUPPORT; - /* PIC is only supported for RTPs. */ - if (flag_pic && !TARGET_VXWORKS_RTP) + /* PIC is only supported for RTPs. flags_pic might be < 0 here, in + contexts where the corresponding switches are not processed, + e.g. from --help. We are not generating code in such cases. */ + if (flag_pic > 0 && !TARGET_VXWORKS_RTP) error ("PIC is only supported for RTPs"); /* VxWorks comes with non-gdb debuggers which only support strict diff --git a/gcc/config/vxworks.h b/gcc/config/vxworks.h index e50260b0..b7e5970 100644 --- a/gcc/config/vxworks.h +++ b/gcc/config/vxworks.h @@ -70,6 +70,12 @@ along with GCC; see the file COPYING3. If not see #endif +/* Our ports rely on gnu-user.h, which #defines _POSIX_SOURCE for + C++ by default. VxWorks doesn't provide 100% of what this implies + (e.g. ::mkstemp), so, arrange to prevent that by falling back to + the default CPP spec for C++ as well. */ +#undef CPLUSPLUS_CPP_SPEC + /* For VxWorks static rtps, the system provides libc_internal.a, a superset of libgcc.a that we need to use e.g. to satisfy references to __init and __fini. We still want our libgcc to prevail for symbols it would provide @@ -84,7 +90,7 @@ along with GCC; see the file COPYING3. If not see #define VXWORKS_SYSCALL_LIBS_RTP #if TARGET_VXWORKS7 -#define VXWORKS_NET_LIBS_RTP "-lnet" +#define VXWORKS_NET_LIBS_RTP "-l%:if-exists-then-else(%:getenv(VSB_DIR /usr/h/public/rtnetStackLib.h) rtnet net)" #else #define VXWORKS_NET_LIBS_RTP "-lnet -ldsi" #endif @@ -152,8 +158,7 @@ along with GCC; see the file COPYING3. If not see /* Setup the crtstuff begin/end we might need for dwarf EH registration. 
*/ #if !defined(CONFIG_SJLJ_EXCEPTIONS) && DWARF2_UNWIND_INFO -#define VX_CRTBEGIN_SPEC \ - "%{!mrtp:vx_crtbegin-kernel.o%s} %{mrtp:vx_crtbegin-rtp.o%s}" +#define VX_CRTBEGIN_SPEC "vx_crtbegin.o%s" #define VX_CRTEND_SPEC "-l:vx_crtend.o" #else #define VX_CRTBEGIN_SPEC "" diff --git a/gcc/config/vxworks/_vxworks-versions.h b/gcc/config/vxworks/_vxworks-versions.h index 0aaf547..15e8bfe 100644 --- a/gcc/config/vxworks/_vxworks-versions.h +++ b/gcc/config/vxworks/_vxworks-versions.h @@ -22,17 +22,29 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #ifndef _VXWORKS_VERSIONS_H #define _VXWORKS_VERSIONS_H 1 -/* All we need is access to the bare _WRS_VXWORKS_MAJOR/MINOR macros - exposed by version.h. Cheat a bit to make sure we don't drag additional - header files, which can easily cause #include ordering nightmares. */ +/* All we need is access to the bare _WRS_VXWORKS_MAJOR/MINOR macros, + exposed by version.h or already provided somehow (e.g. with a self + spec for some reason). When resorting to system headers, cheat a + bit to make sure we don't drag additional header files, which can + easily cause #include ordering nightmares. */ +#if !defined(_WRS_VXWORKS_MAJOR) #pragma push_macro("_WRS_KERNEL") #undef _WRS_KERNEL #include <version.h> #pragma pop_macro("_WRS_KERNEL") +#endif + +/* A lot depends on the MAJOR so we really need to make sure we have + that. MINOR is less critical and many environments don't actually + define it unless it is really meaningful (e.g. 6.4 through 6.9). */ #if !defined(_WRS_VXWORKS_MAJOR) -#error "VxWorks version macros needed but not defined" +#error "_WRS_VXWORKS_MAJOR undefined" +#endif + +#if !defined(_WRS_VXWORKS_MINOR) +#define _WRS_VXWORKS_MINOR 0 #endif #define _VXWORKS_MAJOR_GT(MAJOR) (_WRS_VXWORKS_MAJOR > (MAJOR)) |
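As an illustration of the rs6000-c.c hunk above (not part of the patch itself): the compiler now predefines __PCREL__ whenever pc-relative code generation (OPTION_MASK_PCREL) is in effect, mirroring the existing __MMA__ handling. A minimal, hedged sketch of how source code might key off the new macro; the function name is invented for the example.

/* Report which addressing model this translation unit was built for.
   __PCREL__ is the predefine added in rs6000-c.c above.  */
const char *
example_addressing_model (void)
{
#ifdef __PCREL__
  return "pc-relative";   /* e.g. -mcpu=power10 with pc-relative code enabled.  */
#else
  return "TOC-based";
#endif
}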
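Likewise for the new ISA 3.1 "load rightmost element and sign/zero extend" and "truncate and store rightmost element" built-ins registered in the rs6000-call.c hunks above, a minimal usage sketch (not part of the patch). The vec_xl_sext, vec_xl_zext and vec_xst_trunc spellings follow the prototypes quoted in those hunks and are assumed to be the altivec.h names resolving to __builtin_vec_se_lxvrx, __builtin_vec_ze_lxvrx and __builtin_vec_tr_stxvrx; a 64-bit Power10 target (e.g. -mcpu=power10) is assumed.

#include <altivec.h>

/* lxvrbx: load the byte at *P and sign-extend it to 128 bits.  */
vector signed __int128
load_sext_byte (signed char *p)
{
  return vec_xl_sext (0, p);
}

/* lxvrwx: load the 32-bit word at *P and zero-extend it to 128 bits.  */
vector unsigned __int128
load_zext_word (unsigned int *p)
{
  return vec_xl_zext (0, p);
}

/* stxvrhx: store only the low 16 bits of V to *Q.  */
void
store_trunc_half (vector unsigned __int128 v, unsigned short *q)
{
  vec_xst_trunc (v, 0, q);
}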
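Finally, the smmintrin.h hunk above adds PowerPC implementations of the SSE4.1 element-insert intrinsics. A short usage sketch matching the signatures defined there (again not part of the patch); the compile flags normally needed for the rs6000 x86-compatibility headers (a VSX-capable -mcpu and -DNO_WARN_X86_INTRINSICS) are assumptions, not something this change touches.

#include <smmintrin.h>

/* Overwrite byte 3, 32-bit element 1 and 64-bit element 0 of V,
   using the newly added _mm_insert_epi{8,32,64}.  */
__m128i
patch_lanes (__m128i v)
{
  v = _mm_insert_epi8 (v, 0x7f, 3);
  v = _mm_insert_epi32 (v, 12345, 1);
  v = _mm_insert_epi64 (v, 0x0123456789abcdefLL, 0);
  return v;
}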