Diffstat (limited to 'gcc/config')
71 files changed, 3241 insertions, 1071 deletions
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 8040409..6f11cc0 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -224,7 +224,7 @@ AARCH64_CORE("neoverse-v3ae", neoversev3ae, cortexa57, V9_2A, (SVE2_BITPERM, RNG AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1) /* NVIDIA ('N') cores. */ -AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), neoversev3, 0x4e, 0x10, -1) +AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), olympus, 0x4e, 0x10, -1) /* Armv9-A big.LITTLE processors. */ AARCH64_CORE("gb10", gb10, cortexa57, V9_2A, (SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, MEMTAG, PROFILE), cortexx925, 0x41, AARCH64_BIG_LITTLE (0xd85, 0xd87), -1) diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index c49ff7f..e7926eb 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -125,9 +125,9 @@ const struct cpu_cost_table qdf24xx_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -233,9 +233,9 @@ const struct cpu_cost_table thunderx_extra_costs = { COSTS_N_INSNS (1), /* Alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -340,9 +340,9 @@ const struct cpu_cost_table thunderx2t99_extra_costs = { COSTS_N_INSNS (1), /* Alu. */ COSTS_N_INSNS (4), /* Mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -447,9 +447,9 @@ const struct cpu_cost_table thunderx3t110_extra_costs = { COSTS_N_INSNS (1), /* Alu. */ COSTS_N_INSNS (4), /* Mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -555,9 +555,9 @@ const struct cpu_cost_table tsv110_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -662,9 +662,9 @@ const struct cpu_cost_table a64fx_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -769,9 +769,9 @@ const struct cpu_cost_table ampere1_extra_costs = { COSTS_N_INSNS (3), /* alu. */ COSTS_N_INSNS (3), /* mult. */ - COSTS_N_INSNS (2), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. 
*/ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -876,9 +876,9 @@ const struct cpu_cost_table ampere1a_extra_costs = { COSTS_N_INSNS (3), /* alu. */ COSTS_N_INSNS (3), /* mult. */ - COSTS_N_INSNS (2), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -983,9 +983,9 @@ const struct cpu_cost_table ampere1b_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (2), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (1), /* dup. */ - COSTS_N_INSNS (1) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (0), /* dup. */ + COSTS_N_INSNS (0) /* extract. */ } }; diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 1c3e697..db88df0 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -128,7 +128,9 @@ AARCH64_OPT_FMV_EXTENSION("sha2", SHA2, (SIMD), (), (), "sha1 sha2") AARCH64_FMV_FEATURE("sha3", SHA3, (SHA3)) -AARCH64_OPT_FMV_EXTENSION("aes", AES, (SIMD), (), (), "aes") +AARCH64_OPT_EXTENSION("aes", AES, (SIMD), (), (), "aes") + +AARCH64_FMV_FEATURE("aes", PMULL, (AES)) /* +nocrypto disables AES, SHA2 and SM4, and anything that depends on them (such as SHA3 and the SVE2 crypto extensions). */ @@ -171,8 +173,6 @@ AARCH64_OPT_FMV_EXTENSION("i8mm", I8MM, (SIMD), (), (), "i8mm") instructions. */ AARCH64_OPT_FMV_EXTENSION("bf16", BF16, (FP), (SIMD), (), "bf16") -AARCH64_FMV_FEATURE("rpres", RPRES, ()) - AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16, FCMA), (), (), "sve") /* This specifically does not imply +sve. 
*/ @@ -190,7 +190,7 @@ AARCH64_OPT_FMV_EXTENSION("sve2", SVE2, (SVE), (), (), "sve2") AARCH64_OPT_EXTENSION("sve2-aes", SVE2_AES, (SVE2, AES), (), (), "sveaes") -AARCH64_FMV_FEATURE("sve2-aes", SVE_AES, (SVE2_AES)) +AARCH64_FMV_FEATURE("sve2-aes", SVE_PMULL128, (SVE2_AES)) AARCH64_OPT_EXTENSION("sve2-bitperm", SVE2_BITPERM, (SVE2), (), (), "svebitperm") @@ -245,9 +245,9 @@ AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "sme AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "smef16f16") -AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops") +AARCH64_OPT_FMV_EXTENSION("mops", MOPS, (), (), (), "mops") -AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc") +AARCH64_OPT_FMV_EXTENSION("cssc", CSSC, (), (), (), "cssc") AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr") diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index e946e8d..38c307c 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); rtx aarch64_sve_packed_pred (machine_mode); rtx aarch64_sve_fp_pred (machine_mode, rtx *); +rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx); void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode); bool aarch64_expand_maskloadstore (rtx *, machine_mode); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 270cb2f..8b75c3d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1190,13 +1190,16 @@ [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] ) +;; Inserting from the zero register into a vector lane is treated as an +;; expensive GP->FP move on all CPUs. Avoid it when optimizing for speed. (define_insn "aarch64_simd_vec_set_zero<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_merge:VALL_F16 (match_operand:VALL_F16 1 "register_operand" "0") (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "") (match_operand:SI 2 "immediate_operand" "i")))] - "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0" + "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0 + && optimize_function_for_size_p (cfun)" { int elt = ENDIAN_LANE_N (<nunits>, aarch64_exact_log2_inverse (<nunits>, diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md index 6b3f439..6b1a747 100644 --- a/gcc/config/aarch64/aarch64-sme.md +++ b/gcc/config/aarch64/aarch64-sme.md @@ -62,6 +62,10 @@ ;; (b) they are sometimes used conditionally, particularly in streaming- ;; compatible code. ;; +;; To prevent the latter from upsetting the assembler, we emit the literal +;; encodings of "SMSTART SM" and "SMSTOP SM" when compiling without +;; TARGET_SME. +;; ;; ========================================================================= ;; ------------------------------------------------------------------------- @@ -161,7 +165,9 @@ (clobber (reg:VNx16BI P14_REGNUM)) (clobber (reg:VNx16BI P15_REGNUM))] "" - "smstart\tsm" + { + return TARGET_SME ? "smstart\tsm" : ".inst 0xd503437f // smstart sm"; + } ) ;; Turn off streaming mode. This clobbers all SVE state. @@ -196,7 +202,9 @@ (clobber (reg:VNx16BI P14_REGNUM)) (clobber (reg:VNx16BI P15_REGNUM))] "" - "smstop\tsm" + { + return TARGET_SME ? 
"smstop\tsm" : ".inst 0xd503427f // smstop sm"; + } ) ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def index 8e6aadc..117b70e 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def @@ -92,7 +92,8 @@ DEF_SME_FUNCTION (svstr_zt, str_zt, none, none) DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 && AARCH64_FL_FAMINMAX) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \ + | AARCH64_FL_FAMINMAX) DEF_SME_FUNCTION_GS (svamin, binary_opt_single_n, all_float, x24, none) DEF_SME_FUNCTION_GS (svamax, binary_opt_single_n, all_float, x24, none) #undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 2b627a9..01833a8 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -4004,7 +4004,8 @@ rtx function_expander::get_reg_target () { machine_mode target_mode = result_mode (); - if (!possible_target || GET_MODE (possible_target) != target_mode) + if (!possible_target + || !register_operand (possible_target, target_mode)) possible_target = gen_reg_rtx (target_mode); return possible_target; } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 10aecf1..80a3288 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3752,9 +3752,9 @@ ;; Unpredicated floating-point unary operations. (define_insn "@aarch64_sve_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") - (unspec:SVE_FULL_F - [(match_operand:SVE_FULL_F 1 "register_operand" "w")] + [(set (match_operand:SVE_F 0 "register_operand" "=w") + (unspec:SVE_F + [(match_operand:SVE_F 1 "register_operand" "w")] SVE_FP_UNARY))] "TARGET_SVE" "<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>" @@ -3762,25 +3762,41 @@ ;; Unpredicated floating-point unary operations. (define_expand "<optab><mode>2" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_dup 2) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 1 "register_operand")] + (match_dup 3) + (match_operand:SVE_F 1 "register_operand")] SVE_COND_FP_UNARY_OPTAB))] "TARGET_SVE" { + operands[2] = aarch64_sve_fp_pred (<MODE>mode, &operands[3]); + } +) + +;; FABS and FNEG are non-trapping, so we can always expand with a <VPRED> +;; predicate. It doesn't matter whether the padding bits of a partial +;; vector mode are active or inactive. +(define_expand "<optab><mode>2" + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_dup 2) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_F 1 "register_operand")] + SVE_COND_FP_UNARY_BITWISE))] + "TARGET_SVE" + { operands[2] = aarch64_ptrue_reg (<VPRED>mode); } ) ;; Predicated floating-point unary operations. 
(define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 3 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "register_operand")] + (match_operand:SVE_F 2 "register_operand")] SVE_COND_FP_UNARY))] "TARGET_SVE" {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ] @@ -3806,13 +3822,13 @@ ;; Predicated floating-point unary arithmetic, merging with the first input. (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 3) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand")] + (match_operand:SVE_F 2 "register_operand")] SVE_COND_FP_UNARY) (match_dup 2)] UNSPEC_SEL))] @@ -3854,15 +3870,15 @@ ;; as earlyclobber helps to make the instruction more regular to the ;; register allocator. (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand")] + (match_operand:SVE_F 2 "register_operand")] SVE_COND_FP_UNARY) - (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] @@ -5495,27 +5511,25 @@ ;; Split a predicated instruction whose predicate is unused into an ;; unpredicated instruction. (define_split - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (const_int SVE_RELAXED_GP) + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] <SVE_COND_FP>))] - "TARGET_SVE - && reload_completed - && INTVAL (operands[4]) == SVE_RELAXED_GP" + "TARGET_SVE && reload_completed" [(set (match_dup 0) - (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16 (match_dup 2) (match_dup 3)))] + (SVE_UNPRED_FP_BINARY:SVE_F_B16B16 (match_dup 2) (match_dup 3)))] ) ;; Unpredicated floating-point binary operations (post-RA only). ;; These are generated by the split above. 
(define_insn "*post_ra_<sve_fp_op><mode>3" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand" "=w") - (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16 - (match_operand:SVE_FULL_F_B16B16 1 "register_operand" "w") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand" "w")))] + [(set (match_operand:SVE_F_B16B16 0 "register_operand" "=w") + (SVE_UNPRED_FP_BINARY:SVE_F_B16B16 + (match_operand:SVE_F_B16B16 1 "register_operand" "w") + (match_operand:SVE_F_B16B16 2 "register_operand" "w")))] "TARGET_SVE && reload_completed" "<b><sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>") @@ -5547,10 +5561,10 @@ ;; Unpredicated floating-point binary operations. (define_insn "@aarch64_sve_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") - (unspec:SVE_FULL_F - [(match_operand:SVE_FULL_F 1 "register_operand" "w") - (match_operand:SVE_FULL_F 2 "register_operand" "w")] + [(set (match_operand:SVE_F 0 "register_operand" "=w") + (unspec:SVE_F + [(match_operand:SVE_F 1 "register_operand" "w") + (match_operand:SVE_F 2 "register_operand" "w")] SVE_FP_BINARY))] "TARGET_SVE" "<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" @@ -5559,27 +5573,27 @@ ;; Unpredicated floating-point binary operations that need to be predicated ;; for SVE. (define_expand "<optab><mode>3" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_dup 3) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 1 "<sve_pred_fp_rhs1_operand>") - (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs2_operand>")] + (match_dup 4) + (match_operand:SVE_F_B16B16 1 "<sve_pred_fp_rhs1_operand>") + (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs2_operand>")] SVE_COND_FP_BINARY_OPTAB))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" { - operands[3] = aarch64_ptrue_reg (<VPRED>mode); + operands[3] = aarch64_sve_fp_pred (<MODE>mode, &operands[4]); } ) ;; Predicated floating-point binary operations that have no immediate forms. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand")] SVE_COND_FP_BINARY_REG))] "TARGET_SVE" {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] @@ -5591,30 +5605,33 @@ ;; Predicated floating-point operations with merging. 
(define_expand "@cond_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>") - (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")] + (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>") + (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" + { + operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]); + } ) ;; Predicated floating-point operations, merging with the first input. (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] @@ -5630,14 +5647,14 @@ ) (define_insn "*cond_<optab><mode>_2_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] @@ -5650,14 +5667,14 @@ ;; Same for operations that take a 1-bit constant. 
(define_insn_and_rewrite "*cond_<optab><mode>_2_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] @@ -5673,14 +5690,14 @@ ) (define_insn "*cond_<optab><mode>_2_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] @@ -5693,14 +5710,14 @@ ;; Predicated floating-point operations, merging with the second input. (define_insn_and_rewrite "*cond_<optab><mode>_3_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] @@ -5716,14 +5733,14 @@ ) (define_insn "*cond_<optab><mode>_3_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] @@ -5736,16 +5753,16 @@ ;; Predicated floating-point operations, merging with an independent value. 
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -5780,16 +5797,16 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -5818,16 +5835,16 @@ ;; Same for operations that take a 1-bit constant. (define_insn_and_rewrite "*cond_<optab><mode>_any_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 4 ] @@ -5854,16 +5871,16 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 4 ] @@ -5892,12 +5909,12 @@ ;; Predicated floating-point addition. 
(define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand")] SVE_COND_FP_ADD))] "TARGET_SVE" {@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx ] @@ -5914,14 +5931,14 @@ ;; Predicated floating-point addition of a constant, merging with the ;; first input. (define_insn_and_rewrite "*cond_add<mode>_2_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] @@ -5939,14 +5956,14 @@ ) (define_insn "*cond_add<mode>_2_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] @@ -5962,16 +5979,16 @@ ;; Predicated floating-point addition of a constant, merging with an ;; independent value. 
(define_insn_and_rewrite "*cond_add<mode>_any_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 3 , 4 ] @@ -6001,16 +6018,16 @@ ) (define_insn_and_rewrite "*cond_add<mode>_any_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 3 , 4 ] @@ -6208,12 +6225,12 @@ ;; Predicated floating-point subtraction. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_operand") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_operand") + (match_operand:SVE_F 3 "register_operand")] SVE_COND_FP_SUB))] "TARGET_SVE" {@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx ] @@ -6229,14 +6246,14 @@ ;; Predicated floating-point subtraction from a constant, merging with the ;; second input. 
(define_insn_and_rewrite "*cond_sub<mode>_3_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] @@ -6252,14 +6269,14 @@ ) (define_insn "*cond_sub<mode>_3_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] @@ -6273,16 +6290,16 @@ ;; Predicated floating-point subtraction from a constant, merging with an ;; independent value. (define_insn_and_rewrite "*cond_sub<mode>_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" {@ [ cons: =0 , 1 , 3 , 4 ] @@ -6309,16 +6326,16 @@ ) (define_insn_and_rewrite "*cond_sub<mode>_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" {@ [ cons: =0 , 1 , 3 , 4 ] @@ -6631,12 +6648,12 @@ ;; Predicated floating-point multiplication. 
(define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_mul_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_mul_operand")] SVE_COND_FP_MUL))] "TARGET_SVE" {@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx ] @@ -6671,12 +6688,12 @@ ;; ------------------------------------------------------------------------- (define_expand "div<mode>3" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_dup 3) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 1 "nonmemory_operand") - (match_operand:SVE_FULL_F 2 "register_operand")] + (match_dup 4) + (match_operand:SVE_F 1 "nonmemory_operand") + (match_operand:SVE_F 2 "register_operand")] UNSPEC_COND_FDIV))] "TARGET_SVE" { @@ -6684,23 +6701,23 @@ DONE; operands[1] = force_reg (<MODE>mode, operands[1]); - operands[3] = aarch64_ptrue_reg (<VPRED>mode); + operands[3] = aarch64_sve_fp_pred (<MODE>mode, &operands[4]); } ) (define_expand "@aarch64_frecpe<mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:SVE_FULL_F 1 "register_operand")] + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:SVE_F 1 "register_operand")] UNSPEC_FRECPE))] "TARGET_SVE" ) (define_expand "@aarch64_frecps<mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:SVE_FULL_F 1 "register_operand") - (match_operand:SVE_FULL_F 2 "register_operand")] + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:SVE_F 1 "register_operand") + (match_operand:SVE_F 2 "register_operand")] UNSPEC_FRECPS))] "TARGET_SVE" ) @@ -6865,12 +6882,12 @@ ;; Predicated floating-point maximum/minimum. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_maxmin_operand")] SVE_COND_FP_MAXMIN))] "TARGET_SVE" {@ [ cons: =0 , 1 , %2 , 3 ; attrs: movprfx ] @@ -6899,7 +6916,7 @@ ;; Predicate AND. We can reuse one of the inputs as the GP. ;; Doubling the second operand is the preferred implementation ;; of the MOV alias, so we use that instead of %1/z, %1, %2. -(define_insn "and<mode>3" +(define_insn "@and<mode>3" [(set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") (match_operand:PRED_ALL 2 "register_operand")))] @@ -7581,29 +7598,29 @@ ;; Unpredicated floating-point ternary operations. 
(define_expand "<optab><mode>4" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_dup 4) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 1 "register_operand") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_dup 5) + (match_operand:SVE_F_B16B16 1 "register_operand") + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_TERNARY))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" { - operands[4] = aarch64_ptrue_reg (<VPRED>mode); + operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]); } ) ;; Predicated floating-point ternary operations. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 5 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" {@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ] @@ -7617,17 +7634,17 @@ ;; Predicated floating-point ternary operations with merging. (define_expand "@cond_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" { @@ -7635,20 +7652,22 @@ second of the two. */ if (rtx_equal_p (operands[3], operands[5])) std::swap (operands[2], operands[3]); + + operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]); }) ;; Predicated floating-point ternary operations, merging with the ;; first input. 
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] @@ -7664,15 +7683,15 @@ ) (define_insn "*cond_<optab><mode>_2_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] @@ -7686,15 +7705,15 @@ ;; Predicated floating-point ternary operations, merging with the ;; third input. (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] @@ -7710,15 +7729,15 @@ ) (define_insn "*cond_<optab><mode>_4_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] @@ -7732,17 +7751,17 @@ ;; Predicated floating-point ternary operations, merging with an ;; independent value. 
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 6) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -7778,17 +7797,17 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -8187,20 +8206,23 @@ ;; ;; For unpacked vectors, it doesn't really matter whether SEL uses the ;; the container size or the element size. If SEL used the container size, -;; it would ignore undefined bits of the predicate but would copy the -;; upper (undefined) bits of each container along with the defined bits. -;; If SEL used the element size, it would use undefined bits of the predicate -;; to select between undefined elements in each input vector. Thus the only -;; difference is whether the undefined bits in a container always come from -;; the same input as the defined bits, or whether the choice can vary -;; independently of the defined bits. +;; it would would copy the upper (undefined) bits of each container along +;; with the corresponding defined bits. If SEL used the element size, +;; it would use separate predicate bits to select between the undefined +;; elements in each input vector; these seperate predicate bits might +;; themselves be undefined, depending on the mode of the predicate. +;; +;; Thus the only difference is whether the undefined bits in a container +;; always come from the same input as the defined bits, or whether the +;; choice can vary independently of the defined bits. ;; ;; For the other instructions, using the element size is more natural, ;; so we do that for SEL as well. 
+;; (define_insn "*vcond_mask_<mode><vpred>" [(set (match_operand:SVE_ALL 0 "register_operand") (unspec:SVE_ALL - [(match_operand:<VPRED> 3 "register_operand") + [(match_operand:<VPRED> 3 "aarch64_predicate_operand") (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm") (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] @@ -9653,6 +9675,31 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. +(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>_relaxed" + [(set (match_operand:SVE_HSDI 0 "register_operand") + (unspec:SVE_HSDI + [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand") + (unspec:SVE_HSDI + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_PARTIAL_F 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:SVE_HSDI 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE + && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + (define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") (unspec:SVE_FULL_HSDI @@ -9706,6 +9753,29 @@ } ) +(define_insn_and_rewrite "*cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx2SI_ONLY:mode>_relaxed" + [(set (match_operand:VNx2SI_ONLY 0 "register_operand") + (unspec:VNx2SI_ONLY + [(match_operand:VNx2BI 1 "register_operand") + (unspec:VNx2SI_ONLY + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:VNx2DF_ONLY 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:VNx2SI_ONLY 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<VNx2DF_ONLY:Vetype>, %1/z, %2.<VNx2DF_ONLY:Vetype>\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [INT<-FP] Packs ;; ------------------------------------------------------------------------- @@ -9857,6 +9927,31 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. 
+(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>_relaxed" + [(set (match_operand:SVE_PARTIAL_F 0 "register_operand") + (unspec:SVE_PARTIAL_F + [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand") + (unspec:SVE_PARTIAL_F + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_HSDI 2 "register_operand")] + SVE_COND_ICVTF) + (match_operand:SVE_PARTIAL_F 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE + && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + (define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict" [(set (match_operand:SVE_FULL_F 0 "register_operand") (unspec:SVE_FULL_F @@ -10066,6 +10161,30 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. +(define_insn_and_rewrite "*cond_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>" + [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand") + (unspec:SVE_PARTIAL_HSF + [(match_operand:<SVE_SDF:VPRED> 1 "register_operand") + (unspec:SVE_PARTIAL_HSF + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_SDF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_PARTIAL_HSF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [FP<-FP] Packs (bfloat16) ;; ------------------------------------------------------------------------- @@ -10259,6 +10378,30 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. 
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>_relaxed" + [(set (match_operand:SVE_SDF 0 "register_operand") + (unspec:SVE_SDF + [(match_operand:<SVE_SDF:VPRED> 1 "register_operand") + (unspec:SVE_SDF + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_PARTIAL_HSF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_SDF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [PRED<-PRED] Packs ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 8c03e28..31bdd85 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -1346,12 +1346,12 @@ ;; Predicated B16B16 binary operations. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:VNx8BF_ONLY 0 "register_operand") - (unspec:VNx8BF_ONLY - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_BF 0 "register_operand") + (unspec:SVE_BF + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 4 "aarch64_sve_gp_strictness") - (match_operand:VNx8BF_ONLY 2 "register_operand") - (match_operand:VNx8BF_ONLY 3 "register_operand")] + (match_operand:SVE_BF 2 "register_operand") + (match_operand:SVE_BF 3 "register_operand")] SVE_COND_FP_BINARY_OPTAB))] "TARGET_SSVE_B16B16 && <supports_bf16>" {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx , is_rev ] diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 0485f69..f4a2062 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -356,7 +356,8 @@ static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed); + bool is_packed, + bool is_gather_scatter); static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); @@ -429,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = #include "tuning_models/neoversev2.h" #include "tuning_models/neoversev3.h" #include "tuning_models/neoversev3ae.h" +#include "tuning_models/olympus.h" #include "tuning_models/a64fx.h" #include "tuning_models/fujitsu_monaka.h" @@ -3931,6 +3933,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness) return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode)); } +/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE + is a partial vector mode, and if exceptions must be suppressed for its + undefined elements, convert PRED from a container-level predicate to + an element-level predicate and ensure that the undefined elements + are inactive. Make no changes otherwise. 
+ + Return the resultant predicate. */ +rtx +aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (data_mode); + if (flag_trapping_math && (vec_flags & VEC_PARTIAL)) + { + /* Generate an element-level mask. */ + rtx mask = aarch64_sve_packed_pred (data_mode); + machine_mode pmode = GET_MODE (mask); + + /* Apply the existing predicate. */ + rtx dst = gen_reg_rtx (pmode); + emit_insn (gen_and3 (pmode, dst, mask, + gen_lowpart (pmode, pred))); + return dst; + } + + return pred; +} + /* Emit a comparison CMP between OP0 and OP1, both of which have mode DATA_MODE, and return the result in a predicate of mode PRED_MODE. Use TARGET as the target register if nonnull and convenient. */ @@ -15854,11 +15883,14 @@ cost_plus: break; case CONST_VECTOR: { - /* Load using MOVI/MVNI. */ - if (aarch64_simd_valid_mov_imm (x)) - *cost = extra_cost->vect.movi; - else /* Load using constant pool. */ - *cost = extra_cost->ldst.load; + if (speed) + { + /* Load using MOVI/MVNI. */ + if (aarch64_simd_valid_mov_imm (x)) + *cost += extra_cost->vect.movi; + else /* Load using constant pool. */ + *cost += extra_cost->ldst.load; + } break; } case VEC_CONCAT: @@ -15867,7 +15899,8 @@ cost_plus: break; case VEC_DUPLICATE: /* Load using a DUP. */ - *cost = extra_cost->vect.dup; + if (speed) + *cost += extra_cost->vect.dup; return false; case VEC_SELECT: { @@ -15875,13 +15908,16 @@ cost_plus: *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed); /* cost subreg of 0 as free, otherwise as DUP */ - rtx op1 = XEXP (x, 1); - if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) - ; - else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) - *cost = extra_cost->vect.dup; - else - *cost = extra_cost->vect.extract; + if (speed) + { + rtx op1 = XEXP (x, 1); + if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) + ; + else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) + *cost += extra_cost->vect.dup; + else + *cost += extra_cost->vect.extract; + } return true; } default: @@ -17157,8 +17193,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info, && STMT_VINFO_DATA_REF (stmt_info)) { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - if (stmt_info - && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES) + if (node + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -17429,8 +17465,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, for each element. We therefore need to divide the full-instruction cost by the number of elements in the vector. */ if (kind == scalar_load + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int nunits = vect_nunits_for_cost (vectype); /* Test for VNx2 modes, which have 64-bit containers. */ @@ -17442,8 +17479,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which a scalar_store is really storing one element in a scatter operation. */ if (kind == scalar_store + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. 
*/ @@ -17699,7 +17737,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && kind == vec_to_scalar && (m_vec_flags & VEC_ADVSIMD) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { auto dr = STMT_VINFO_DATA_REF (stmt_info); tree dr_ref = DR_REF (dr); @@ -17712,7 +17750,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, { if (gimple_vuse (SSA_NAME_DEF_STMT (offset))) { - if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type) + if (SLP_TREE_TYPE (node) == load_vec_info_type) ops->loads += count - 1; else /* Stores want to count both the index to array and data to @@ -17814,7 +17852,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && sve_issue && (kind == scalar_load || kind == scalar_store) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17969,8 +18007,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Check if we've seen an SVE gather/scatter operation and which size. */ if (kind == scalar_load + && node + && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) @@ -19958,8 +19998,9 @@ aarch64_process_one_target_attr (char *arg_str) if (valid) { set_option (&global_options, NULL, p_attr->opt_num, value, - NULL, DK_UNSPECIFIED, input_location, - global_dc); + NULL, + static_cast<int> (diagnostics::kind::unspecified), + input_location, global_dc); } else { @@ -20471,6 +20512,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2) unsigned long _size; // Size of the struct, so it can grow. unsigned long _hwcap; unsigned long _hwcap2; + unsigned long _hwcap3; + unsigned long _hwcap4; } */ @@ -20487,14 +20530,24 @@ build_ifunc_arg_type () tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, get_identifier ("_hwcap2"), long_unsigned_type_node); + tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap3"), + long_unsigned_type_node); + tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap4"), + long_unsigned_type_node); DECL_FIELD_CONTEXT (field1) = ifunc_arg_type; DECL_FIELD_CONTEXT (field2) = ifunc_arg_type; DECL_FIELD_CONTEXT (field3) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field4) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field5) = ifunc_arg_type; TYPE_FIELDS (ifunc_arg_type) = field1; DECL_CHAIN (field1) = field2; DECL_CHAIN (field2) = field3; + DECL_CHAIN (field3) = field4; + DECL_CHAIN (field4) = field5; layout_type (ifunc_arg_type); @@ -24406,10 +24459,14 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed) static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed) + bool is_packed, + bool is_gather_scatter) { if (TARGET_SIMD && STRICT_ALIGNMENT) { + if (is_gather_scatter) + return true; + /* Return if movmisalign pattern is not supported for this mode. 
*/ if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing) return false; @@ -24419,7 +24476,8 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode, return false; } return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* If VALS is a vector constant that can be loaded into a register @@ -31948,9 +32006,43 @@ aarch64_test_sysreg_encoding_clashes (void) static void aarch64_test_sve_folding () { + aarch64_target_switcher switcher (AARCH64_FL_SVE); + tree res = fold_unary (BIT_NOT_EXPR, ssizetype, ssize_int (poly_int64 (1, 1))); ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1)))); + + auto build_v16bi = [](bool a, bool b) + { + rtx_vector_builder builder (VNx16BImode, 2, 1); + builder.quick_push (a ? const1_rtx : const0_rtx); + builder.quick_push (b ? const1_rtx : const0_rtx); + return builder.build (); + }; + rtx v16bi_10 = build_v16bi (1, 0); + rtx v16bi_01 = build_v16bi (0, 1); + + for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode }) + { + rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1); + rtx subreg = lowpart_subreg (VNx16BImode, reg, mode); + rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg); + rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode)); + + rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode)); + rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg); + + rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10); + ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode), + lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg), + VNx16BImode)); + rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg); + } } /* Run all target-specific selftests. */ diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.cc b/gcc/config/aarch64/cortex-a57-fma-steering.cc index fd6da66..f7675be 100644 --- a/gcc/config/aarch64/cortex-a57-fma-steering.cc +++ b/gcc/config/aarch64/cortex-a57-fma-steering.cc @@ -948,6 +948,11 @@ func_fma_steering::analyze () /* Search the chain where this instruction is (one of) the root. */ dest_op_info = insn_rr[INSN_UID (insn)].op_info; + + /* Register rename could fail. */ + if (!dest_op_info) + continue; + dest_regno = REGNO (SET_DEST (PATTERN (insn))); for (i = 0; i < dest_op_info->n_chains; i++) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c59fcd6..8533912 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -463,6 +463,7 @@ (define_mode_iterator VNx8SI_ONLY [VNx8SI]) (define_mode_iterator VNx8SF_ONLY [VNx8SF]) (define_mode_iterator VNx8DI_ONLY [VNx8DI]) +(define_mode_iterator VNx2SI_ONLY [VNx2SI]) (define_mode_iterator VNx4SI_ONLY [VNx4SI]) (define_mode_iterator VNx4SF_ONLY [VNx4SF]) (define_mode_iterator VNx2DI_ONLY [VNx2DI]) @@ -3366,6 +3367,10 @@ (define_int_iterator SVE_INT_UNARY [UNSPEC_REVB UNSPEC_REVH UNSPEC_REVW]) +;; This iterator is currently only used for estimation instructions, +;; which are never generated automatically when -ftrapping-math is true. +;; The iterator is therefore applied unconditionally to partial FP modes. 
+;; This might need to be revisited if new operations are added in future. (define_int_iterator SVE_FP_UNARY [UNSPEC_FRECPE UNSPEC_RSQRTE]) (define_int_iterator SVE_FP_UNARY_INT [(UNSPEC_FEXPA "TARGET_NON_STREAMING")]) @@ -3378,6 +3383,10 @@ (define_int_iterator SVE_INT_BINARY_MULTI [UNSPEC_SQDMULH UNSPEC_SRSHL UNSPEC_URSHL]) +;; This iterator is currently only used for estimation instructions, +;; which are never generated automatically when -ftrapping-math is true. +;; The iterator is therefore applied unconditionally to partial FP modes. +;; This might need to be revisited if new operations are added in future. (define_int_iterator SVE_FP_BINARY [UNSPEC_FRECPS UNSPEC_RSQRTS]) (define_int_iterator SVE_FP_BINARY_INT [UNSPEC_FTSMUL UNSPEC_FTSSEL]) @@ -3429,9 +3438,10 @@ UNSPEC_FMINQV UNSPEC_FMINNMQV]) -(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FABS - UNSPEC_COND_FNEG - UNSPEC_COND_FRECPX +(define_int_iterator SVE_COND_FP_UNARY_BITWISE [UNSPEC_COND_FABS + UNSPEC_COND_FNEG]) + +(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FRECPX UNSPEC_COND_FRINTA UNSPEC_COND_FRINTI UNSPEC_COND_FRINTM @@ -3439,13 +3449,12 @@ UNSPEC_COND_FRINTP UNSPEC_COND_FRINTX UNSPEC_COND_FRINTZ - UNSPEC_COND_FSQRT]) + UNSPEC_COND_FSQRT + SVE_COND_FP_UNARY_BITWISE]) ;; Same as SVE_COND_FP_UNARY, but without codes that have a dedicated ;; <optab><mode>2 expander. -(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FABS - UNSPEC_COND_FNEG - UNSPEC_COND_FRECPX +(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FRECPX UNSPEC_COND_FRINTA UNSPEC_COND_FRINTI UNSPEC_COND_FRINTM diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h index f76a250..9eb1a20 100644 --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h @@ -26,7 +26,7 @@ static const struct cpu_addrcost_table generic_armv9_a_addrcost_table = { { - 1, /* hi */ + 0, /* hi */ 0, /* si */ 0, /* di */ 1, /* ti */ diff --git a/gcc/config/aarch64/tuning_models/olympus.h b/gcc/config/aarch64/tuning_models/olympus.h new file mode 100644 index 0000000..268789d --- /dev/null +++ b/gcc/config/aarch64/tuning_models/olympus.h @@ -0,0 +1,210 @@ +/* Tuning model description for the NVIDIA Olympus core. + Copyright The GNU Toolchain Authors. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_H_OLYMPUS +#define GCC_AARCH64_H_OLYMPUS + +#include "generic.h" + +static struct cpu_regmove_cost olympus_regmove_cost = +{ + 1, /* GP2GP */ + /* Spilling to int<->fp instead of memory is recommended so set + realistic costs compared to memmov_cost. 
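+   (E.g. GP2FP below is 3, while olympus_tunings' memmov_cost charges 4
+   and 6 for load_int and load_fp, so int<->fp copies win over spills.)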
*/ + 3, /* GP2FP */ + 3, /* FP2GP */ + 2 /* FP2FP */ +}; + +static advsimd_vec_cost olympus_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 2, /* ld2_st2_permute_cost */ + 2, /* ld3_st3_permute_cost */ + 3, /* ld4_st4_permute_cost */ + 2, /* permute_cost */ + 5, /* reduc_i8_cost */ + 3, /* reduc_i16_cost */ + 3, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 4, /* reduc_f16_cost */ + 4, /* reduc_f32_cost */ + 4, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + 8, /* vec_to_scalar_cost */ + 4, /* scalar_to_vec_cost */ + 6, /* align_load_cost */ + 6, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static sve_vec_cost olympus_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 2, /* ld2_st2_permute_cost */ + 3, /* ld3_st3_permute_cost */ + 3, /* ld4_st4_permute_cost */ + 2, /* permute_cost */ + 9, /* reduc_i8_cost */ + 8, /* reduc_i16_cost */ + 6, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 8, /* reduc_f16_cost */ + 6, /* reduc_f32_cost */ + 4, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + 8, /* vec_to_scalar_cost */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 6, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 10, /* fadda_f16_cost */ + 6, /* fadda_f32_cost */ + 4, /* fadda_f64_cost */ + 14, /* gather_load_x32_cost */ + 12, /* gather_load_x64_cost */ + 42, /* gather_load_x32_init_cost */ + 24, /* gather_load_x64_init_cost */ + 1 /* scatter_store_elt_cost */ +}; + +static aarch64_scalar_vec_issue_info olympus_scalar_issue_info = +{ + 4, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 8, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static aarch64_advsimd_vec_issue_info olympus_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 6, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static aarch64_sve_vec_issue_info olympus_sve_issue_info = +{ + { + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 6, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 2, /* pred_ops_per_cycle */ + 1, /* while_pred_ops */ + 0, /* int_cmp_pred_ops */ + 0, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static aarch64_vec_issue_info olympus_vec_issue_info = +{ + &olympus_scalar_issue_info, + &olympus_advsimd_issue_info, + &olympus_sve_issue_info +}; + +/* Olympus costs for vector insn classes. */ +static struct cpu_vector_cost olympus_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &olympus_advsimd_vector_cost, /* advsimd */ + &olympus_sve_vector_cost, /* sve */ + &olympus_vec_issue_info /* issue_info */ +}; + +/* Olympus prefetch settings (which disable prefetch). 
*/ +static cpu_prefetch_tune olympus_prefetch_tune = +{ + 0, /* num_slots */ + -1, /* l1_cache_size */ + 64, /* l1_cache_line_size */ + -1, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + +static struct tune_params olympus_tunings = +{ + &cortexa76_extra_costs, + &generic_armv9_a_addrcost_table, + &olympus_regmove_cost, + &olympus_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_128, /* sve_width */ + { 4, /* load_int. */ + 1, /* store_int. */ + 6, /* load_fp. */ + 3, /* store_fp. */ + 5, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ + 10, /* issue_rate */ + AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 8, /* int_reassoc_width. */ + 6, /* fp_reassoc_width. */ + 4, /* fma_reassoc_width. */ + 6, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_BASE + | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ + &olympus_prefetch_tune, + AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ + AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ +}; + +#endif /* GCC_AARCH64_H_OLYMPUS. */ diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h index c7a14b3..0600e59 100644 --- a/gcc/config/arm/aarch-cost-tables.h +++ b/gcc/config/arm/aarch-cost-tables.h @@ -123,9 +123,9 @@ const struct cpu_cost_table generic_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -230,9 +230,9 @@ const struct cpu_cost_table cortexa53_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -337,9 +337,9 @@ const struct cpu_cost_table cortexa57_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -444,9 +444,9 @@ const struct cpu_cost_table cortexa76_extra_costs = { COSTS_N_INSNS (1), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -551,9 +551,9 @@ const struct cpu_cost_table exynosm1_extra_costs = { COSTS_N_INSNS (0), /* alu. */ COSTS_N_INSNS (4), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. */ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; @@ -658,9 +658,9 @@ const struct cpu_cost_table xgene1_extra_costs = { COSTS_N_INSNS (2), /* alu. */ COSTS_N_INSNS (8), /* mult. */ - COSTS_N_INSNS (1), /* movi. */ - COSTS_N_INSNS (2), /* dup. 
*/ - COSTS_N_INSNS (2) /* extract. */ + COSTS_N_INSNS (0), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ } }; diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index bde06f3..29b45ae 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -289,7 +289,8 @@ static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); static bool arm_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed); + bool is_packed, + bool is_gather_scatter); static void arm_conditional_register_usage (void); static enum flt_eval_method arm_excess_precision (enum excess_precision_type); static reg_class_t arm_preferred_rename_class (reg_class_t rclass); @@ -30661,12 +30662,16 @@ arm_vector_alignment_reachable (const_tree type, bool is_packed) static bool arm_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed) + bool is_packed, + bool is_gather_scatter) { if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access) { HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type); + if (is_gather_scatter) + return true; + if (is_packed) return align == 1; @@ -30683,7 +30688,8 @@ arm_builtin_support_vector_misalignment (machine_mode mode, } return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } static void diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc index 284f49d..69df6d2 100644 --- a/gcc/config/avr/avr-passes.cc +++ b/gcc/config/avr/avr-passes.cc @@ -4120,9 +4120,8 @@ avr_optimize_casesi (rtx_insn *insns[5], rtx *xop) JUMP_LABEL (cbranch) = xop[4]; ++LABEL_NUSES (xop[4]); - rtx_insn *seq1 = get_insns (); rtx_insn *last1 = get_last_insn (); - end_sequence (); + rtx_insn *seq1 = end_sequence (); emit_insn_after (seq1, insns[2]); @@ -4141,9 +4140,8 @@ avr_optimize_casesi (rtx_insn *insns[5], rtx *xop) emit_insn (pat_4); - rtx_insn *seq2 = get_insns (); rtx_insn *last2 = get_last_insn (); - end_sequence (); + rtx_insn *seq2 = end_sequence (); emit_insn_after (seq2, insns[3]); @@ -4845,6 +4843,137 @@ avr_pass_fuse_add::execute1 (function *func) ////////////////////////////////////////////////////////////////////////////// +// Fuse 2 move insns after combine. 
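+//
+// A sketch of the transformation (pseudo-RTL, hypothetical registers):
+//
+//    insn1:  (set (reg:HI 48) (reg:HI 24))
+//    insn2:  (set (reg:HI 22) (reg:HI 48))
+//
+// When pseudo 48 is used by insn2 only, the source is propagated and the
+// first move is deleted:
+//
+//    insn2:  (set (reg:HI 22) (reg:HI 24))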
+ +static const pass_data avr_pass_data_2moves = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + 0 // todo_flags_finish +}; + +class avr_pass_2moves : public rtl_opt_pass +{ +public: + avr_pass_2moves (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_2moves, ctxt) + { + this->name = name; + } + + unsigned int execute (function *func) final override + { + if (optimize && avropt_fuse_move2) + { + bool changed = false; + basic_block bb; + + FOR_EACH_BB_FN (bb, func) + { + changed |= optimize_2moves_bb (bb); + } + + if (changed) + { + df_note_add_problem (); + df_analyze (); + } + } + + return 0; + } + + bool optimize_2moves (rtx_insn *, rtx_insn *); + bool optimize_2moves_bb (basic_block); +}; // avr_pass_2moves + +bool +avr_pass_2moves::optimize_2moves_bb (basic_block bb) +{ + bool changed = false; + rtx_insn *insn1 = nullptr; + rtx_insn *insn2 = nullptr; + rtx_insn *curr; + + FOR_BB_INSNS (bb, curr) + { + if (insn1 && INSN_P (insn1) + && insn2 && INSN_P (insn2)) + changed |= optimize_2moves (insn1, insn2); + + insn1 = insn2; + insn2 = curr; + } + + return changed; +} + +bool +avr_pass_2moves::optimize_2moves (rtx_insn *insn1, rtx_insn *insn2) +{ + bool good = false; + bool bad = false; + rtx set1, dest1, src1; + rtx set2, dest2, src2; + + if ((set1 = single_set (insn1)) + && (set2 = single_set (insn2)) + && (src1 = SET_SRC (set1)) + && REG_P (src2 = SET_SRC (set2)) + && REG_P (dest1 = SET_DEST (set1)) + && REG_P (dest2 = SET_DEST (set2)) + && rtx_equal_p (dest1, src2) + // Now we have: + // insn1: dest1 = src1 + // insn2: dest2 = dest1 + && REGNO (dest1) >= FIRST_PSEUDO_REGISTER + // Paranoia. + && GET_CODE (PATTERN (insn1)) != PARALLEL + && GET_CODE (PATTERN (insn2)) != PARALLEL + && (rtx_equal_p (dest2, src1) + || !reg_overlap_mentioned_p (dest2, src1))) + { + avr_dump ("\n;; Found 2moves:\n%r\n%r\n", insn1, insn2); + avr_dump (";; reg %d: insn uses uids:", REGNO (dest1)); + + // Go check that dest1 is used exactly once, namely by insn2. + + df_ref use = DF_REG_USE_CHAIN (REGNO (dest1)); + for (; use; use = DF_REF_NEXT_REG (use)) + { + rtx_insn *user = DF_REF_INSN (use); + avr_dump (" %d", INSN_UID (user)); + good |= INSN_UID (user) == INSN_UID (insn2); + bad |= INSN_UID (user) != INSN_UID (insn2); + } + avr_dump (".\n"); + + if (good && !bad + // Propagate src1 to insn2: + // insn1: # Deleted + // insn2: dest2 = src1 + && validate_change (insn2, &SET_SRC (set2), src1, false)) + { + SET_INSN_DELETED (insn1); + return true; + } + } + + if (good && !bad) + avr_dump (";; Failed\n"); + + return false; +} + + + +////////////////////////////////////////////////////////////////////////////// // Split insns with nonzero_bits() after combine. static const pass_data avr_pass_data_split_nzb = @@ -5706,6 +5835,14 @@ make_avr_pass_casesi (gcc::context *ctxt) return new avr_pass_casesi (ctxt, "avr-casesi"); } +// Optimize 2 consecutive moves after combine. 
+ +rtl_opt_pass * +make_avr_pass_2moves (gcc::context *ctxt) +{ + return new avr_pass_2moves (ctxt, "avr-2moves"); +} + rtl_opt_pass * make_avr_pass_split_nzb (gcc::context *ctxt) { diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def index eb60a93..d668c7f 100644 --- a/gcc/config/avr/avr-passes.def +++ b/gcc/config/avr/avr-passes.def @@ -74,6 +74,14 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes); INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi); +/* Insn combine may come up with superfluous reg-reg moves, where the combine + people say that these are no problem since reg-alloc is supposed to optimize + them. The issue is that the lower-subreg pass sitting between combine and + reg-alloc may split such moves, coming up with a zoo of subregs which are + only handled poorly by the register allocator. */ + +INSERT_PASS_AFTER (pass_combine, 1, avr_pass_2moves); + /* Some combine insns have nonzero_bits() in their condition, though insns should not use such stuff in their condition. Therefore, we split such insn into something without nonzero_bits() in their condition right after diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h index ca30136..37911e7 100644 --- a/gcc/config/avr/avr-protos.h +++ b/gcc/config/avr/avr-protos.h @@ -208,6 +208,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *); extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *); extern rtl_opt_pass *make_avr_pass_split_nzb (gcc::context *); extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *); +extern rtl_opt_pass *make_avr_pass_2moves (gcc::context *); #ifdef RTX_CODE extern bool avr_casei_sequence_check_operands (rtx *xop); extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands); diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc index c469297..1fb59b6 100644 --- a/gcc/config/avr/avr.cc +++ b/gcc/config/avr/avr.cc @@ -14418,6 +14418,13 @@ avr_output_addr_vec (rtx_insn *labl, rtx table) // Output the label that precedes the table. ASM_OUTPUT_ALIGN (stream, 1); + + char s_labl[40]; + targetm.asm_out.generate_internal_label (s_labl, "L", + CODE_LABEL_NUMBER (labl)); + ASM_OUTPUT_TYPE_DIRECTIVE (stream, s_labl, + AVR_HAVE_JMP_CALL ? "object" : "function"); + targetm.asm_out.internal_label (stream, "L", CODE_LABEL_NUMBER (labl)); // Output the table's content. @@ -14984,10 +14991,11 @@ avr_addr_space_convert (rtx src, tree type_old, tree type_new) /* Linearize memory: RAM has bit 23 set. When as_new = __flashx then this is basically UB since __flashx mistreats RAM addresses, but there - is no way to bail out. (Though -Waddr-space-convert will tell.) */ + is no way to bail out. (Though -Waddr-space-convert will tell.) + ...but PR121277 is confusing, in particular when NULL is coming in. */ int msb = ADDR_SPACE_GENERIC_P (as_old) - ? 0x80 + ? as_new == ADDR_SPACE_MEMX ? 0x80 : 0x00 : avr_addrspace[as_old].segment; src = force_reg (Pmode, src); @@ -15085,10 +15093,16 @@ avr_convert_to_type (tree type, tree expr) const char *name_old = avr_addrspace[as_old].name; const char *name_new = avr_addrspace[as_new].name; - warning (OPT_Waddr_space_convert, - "conversion from address space %qs to address space %qs", - ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old, - ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new); + // Be relaxed when NULL is used, and when 0x0 stands for + // address 0x0. 
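+  // E.g. (hypothetical): 'const __flash char *p = 0;' converts the
+  // null pointer into flash space and should not be diagnosed.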
+  bool nowarn = (expr == null_pointer_node
+		 && (as_new == ADDR_SPACE_FLASHX
+		     || as_new == ADDR_SPACE_FLASH));
+  if (!nowarn)
+    warning (OPT_Waddr_space_convert,
+	     "conversion from address space %qs to address space %qs",
+	     ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
+	     ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
 
   return fold_build1_loc (loc, ADDR_SPACE_CONVERT_EXPR, type, expr);
 }
diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt index 9883119..7f6f18c 100644 --- a/gcc/config/avr/avr.opt +++ b/gcc/config/avr/avr.opt @@ -164,6 +164,10 @@ mfuse-move= Target Joined RejectNegative UInteger Var(avropt_fuse_move) Init(0) Optimization IntegerRange(0, 23) -mfuse-move=<0,23> Optimization. Run a post-reload pass that tweaks move instructions.
+mfuse-move2
+Target Var(avropt_fuse_move2) Init(0) Optimization
+Optimization. Fuse some move insns after insn combine.
+
 mabsdata Target Mask(ABSDATA) Assume that all data in static storage can be accessed by LDS / STS instructions. This option is only useful for reduced Tiny devices like ATtiny40.
diff --git a/gcc/config/avr/avr.opt.urls b/gcc/config/avr/avr.opt.urls index 662fdee..87c26b2 100644 --- a/gcc/config/avr/avr.opt.urls +++ b/gcc/config/avr/avr.opt.urls @@ -92,6 +92,9 @@ UrlSuffix(gcc/AVR-Options.html#index-mfuse-move) mfuse-move= UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
+mfuse-move2
+UrlSuffix(gcc/AVR-Options.html#index-mfuse-move2)
+
 mabsdata UrlSuffix(gcc/AVR-Options.html#index-mabsdata)
diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index a34c9e9..4acdd1d 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -3711,9 +3711,11 @@ cris_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, /* Determine if the source using MOF. If it is, automatically clobbering MOF would cause it to have impossible constraints. */
-  /* Look for a use of the MOF constraint letter: h.  */
+  /* Look for a use of the MOF constraint letter h or a hard register
+     constraint.  */
   for (unsigned i = 0, n = constraints.length(); i < n; ++i)
-    if (strchr (constraints[i], 'h') != NULL)
+    if (strchr (constraints[i], 'h') != NULL
+	|| strstr (constraints[i], "{mof}") != NULL)
       return NULL;
 
   /* Look for an output or an input that touches MOF.  */
diff --git a/gcc/config/epiphany/epiphany.cc b/gcc/config/epiphany/epiphany.cc index 16626f8..f53a643 100644 --- a/gcc/config/epiphany/epiphany.cc +++ b/gcc/config/epiphany/epiphany.cc @@ -2816,12 +2816,16 @@ epiphany_vector_alignment_reachable (const_tree type, bool is_packed) static bool epiphany_support_vector_misalignment (machine_mode mode, const_tree type,
-				      int misalignment, bool is_packed)
+				      int misalignment, bool is_packed,
+				      bool is_gather_scatter)
 {
+  if (is_gather_scatter)
+    return true;
   if (GET_MODE_SIZE (mode) == 8 && misalignment % 4 == 0)
     return true;
   return default_builtin_support_vector_misalignment (mode, type, misalignment,
-						      is_packed);
+						      is_packed,
+						      is_gather_scatter);
 }
 
 /* STRUCTURE_SIZE_BOUNDARY seems a bit crude in how it enlarges small
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index 0bfc786..0287400 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -82,11 +82,18 @@ enum hsaco_attr_type #define TARGET_DPP_FULL !TARGET_RDNA2_PLUS #define TARGET_DPP16 TARGET_RDNA2_PLUS #define TARGET_DPP8 TARGET_RDNA2_PLUS
+/* Device requires no manually inserted wait states; that's the
+   case for RDNA 2, 3 and 3.5 (but not for RDNA 4).
*/ +#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS /* Device requires CDNA1-style manually inserted wait states for AVGPRs. */ #define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1 +/* Device requires CDNA3-style manually inserted wait states. */ +#define TARGET_CDNA3_NOPS TARGET_CDNA3 /* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag for non-scalar memory operations. The string starts on purpose with a space. Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used. + Note: on atomics, glc/sc0 denotes whether the pre-op operation should + be used. CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however, there is no non-scalar user so far. */ #define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc") diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 7c4dde1..a34d2e3 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -811,7 +811,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) ; FIXME: 64bit operations really should be splitters, but I am not sure how ; to represent vertical subregs. @@ -828,7 +828,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_expand "vec_set<mode>" [(set (match_operand:V_MOV 0 "register_operand") @@ -854,7 +854,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_insn "*vec_set<mode>_1" [(set (match_operand:V_2REG 0 "register_operand" "=v") @@ -871,7 +871,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_insn "vec_duplicate<mode><exec>" [(set (match_operand:V_1REG 0 "register_operand" "=v") @@ -910,7 +910,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<mode><scalar_mode>" [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg") @@ -922,7 +922,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<mode><scalar_mode>" [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg") @@ -934,7 +934,7 @@ [(set_attr "type" "vmult") (set_attr "length" "32") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop" [(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v") @@ -1133,6 +1133,23 @@ DONE; }) +(define_expand "gather_load<mode><vndi>" + [(match_operand:V_MOV 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:<VnDI> 2 "register_operand") + (match_operand 3 "immediate_operand") + (match_operand:SI 4 "gcn_alu_operand")] + "" + { + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1], + operands[2], operands[4], + INTVAL (operands[3]), NULL); + + emit_insn (gen_gather<mode>_insn_1offset (operands[0], addr, const0_rtx, + const0_rtx, const0_rtx)); + DONE; + }) + ; Allow any address expression (define_expand "gather<mode>_expr<exec>" [(set (match_operand:V_MOV 0 "register_operand") @@ -1175,6 +1192,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr 
"flatmemaccess" "load") (set_attr "length" "12") (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) @@ -1233,6 +1251,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "load") (set_attr "length" "12") (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) @@ -1259,6 +1278,23 @@ DONE; }) +(define_expand "scatter_store<mode><vndi>" + [(match_operand:DI 0 "register_operand") + (match_operand:<VnDI> 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand:SI 3 "gcn_alu_operand") + (match_operand:V_MOV 4 "register_operand")] + "" + { + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0], + operands[1], operands[3], + INTVAL (operands[2]), NULL); + + emit_insn (gen_scatter<mode>_insn_1offset (addr, const0_rtx, operands[4], + const0_rtx, const0_rtx)); + DONE; + }) + ; Allow any address expression (define_expand "scatter<mode>_expr<exec_scatter>" [(set (mem:BLK (scratch)) @@ -1301,6 +1337,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "store") (set_attr "length" "12") (set_attr "cdna" "*,cdna2")]) @@ -1356,6 +1393,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "store") (set_attr "length" "12") (set_attr "cdna" "*,cdna2")]) @@ -1645,6 +1683,39 @@ [(set_attr "type" "vmult") (set_attr "length" "8")]) +(define_insn_and_split "add<mode>3_dup" + [(set (match_operand:V_DI 0 "register_operand" "= v") + (plus:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "register_operand" "SvB")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDb"))) + (clobber (reg:DI VCC_REG)) + (clobber (match_scratch:<VnSI> 3 "=&v"))] + "" + "#" + "gcn_can_split_p (<MODE>mode, operands[0]) + && gcn_can_split_p (<MODE>mode, operands[1]) + && gcn_can_split_p (<MODE>mode, operands[2])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_add<vnsi>3_vcc_dup + (gcn_operand_part (<MODE>mode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (<MODE>mode, operands[2], 0), + vcc)); + emit_insn (gen_vec_duplicate<vnsi> (operands[3], + gcn_operand_part (DImode, operands[1], 1))); + emit_insn (gen_addc<vnsi>3 + (gcn_operand_part (<MODE>mode, operands[0], 1), + operands[3], + gcn_operand_part (<MODE>mode, operands[2], 1), + vcc, vcc)); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + (define_insn_and_split "add<mode>3_exec" [(set (match_operand:V_DI 0 "register_operand" "= v") (vec_merge:V_DI @@ -1682,6 +1753,49 @@ [(set_attr "type" "vmult") (set_attr "length" "8")]) +(define_insn_and_split "add<mode>3_dup_exec" + [(set (match_operand:V_DI 0 "register_operand" "= v") + (vec_merge:V_DI + (plus:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "register_operand" "SvB")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDb")) + (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (reg:DI VCC_REG)) + (clobber (match_scratch:<VnSI> 5 "=&v"))] + "" + "#" + "gcn_can_split_p (<MODE>mode, operands[0]) + && gcn_can_split_p (<MODE>mode, operands[1]) + && gcn_can_split_p (<MODE>mode, operands[2]) + && gcn_can_split_p (<MODE>mode, operands[4])" + [(const_int 0)] + { + rtx vcc = gen_rtx_REG (DImode, VCC_REG); + emit_insn (gen_add<vnsi>3_vcc_dup_exec + (gcn_operand_part (<MODE>mode, operands[0], 0), + gcn_operand_part (DImode, operands[1], 0), + gcn_operand_part (<MODE>mode, operands[2], 0), + vcc, + gcn_operand_part (<MODE>mode, operands[3], 0), + operands[4])); + 
emit_insn (gen_vec_duplicate<vnsi>_exec (operands[5], + gcn_operand_part (DImode, operands[1], 1), + gcn_gen_undef (<VnSI>mode), + operands[4])); + emit_insn (gen_addc<vnsi>3_exec + (gcn_operand_part (<MODE>mode, operands[0], 1), + operands[5], + gcn_operand_part (<MODE>mode, operands[2], 1), + vcc, vcc, + gcn_operand_part (<MODE>mode, operands[3], 1), + operands[4])); + DONE; + } + [(set_attr "type" "vmult") + (set_attr "length" "8")]) + (define_insn_and_split "sub<mode>3" [(set (match_operand:V_DI 0 "register_operand" "= v, v") (minus:V_DI @@ -2187,6 +2301,22 @@ [(set_attr "type" "vop3a") (set_attr "length" "8")]) +(define_insn "<su>mul<mode>3_highpart_dup<exec>" + [(set (match_operand:V_SI 0 "register_operand" "= v") + (truncate:V_SI + (lshiftrt:<VnDI> + (mult:<VnDI> + (any_extend:<VnDI> + (vec_duplicate:V_SI + (match_operand:SI 1 "gcn_alu_operand" "SvA"))) + (any_extend:<VnDI> + (match_operand:V_SI 2 "gcn_alu_operand" " vA"))) + (const_int 32))))] + "" + "v_mul_hi<sgnsuffix>0\t%0, %2, %1" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + (define_insn "mul<mode>3<exec>" [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") (mult:V_INT_1REG @@ -2198,11 +2328,11 @@ (set_attr "length" "8")]) (define_insn "mul<mode>3_dup<exec>" - [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") + [(set (match_operand:V_INT_1REG 0 "register_operand" "= v") (mult:V_INT_1REG - (match_operand:V_INT_1REG 1 "gcn_alu_operand" "%vSvA") (vec_duplicate:V_INT_1REG - (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand" " SvA"))))] + (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvA")) + (match_operand:V_INT_1REG 2 "gcn_alu_operand" " vA")))] "" "v_mul_lo_u32\t%0, %1, %2" [(set_attr "type" "vop3a") @@ -2238,6 +2368,37 @@ DONE; }) +(define_insn_and_split "mul<mode>3_dup" + [(set (match_operand:V_DI 0 "register_operand" "=&v") + (mult:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "gcn_alu_operand" " Sv")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDA"))) + (clobber (match_scratch:<VnSI> 3 "=&v"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0); + rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1); + rtx left_lo = gcn_operand_part (DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0); + rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1); + rtx tmp = operands[3]; + + emit_insn (gen_mul<vnsi>3_dup (out_lo, left_lo, right_lo)); + emit_insn (gen_umul<vnsi>3_highpart_dup (out_hi, left_lo, right_lo)); + emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_lo)); + emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp)); + emit_insn (gen_mul<vnsi>3_dup (tmp, left_lo, right_hi)); + emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp)); + emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_hi)); + emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp)); + DONE; + }) + (define_insn_and_split "mul<mode>3_exec" [(set (match_operand:V_DI 0 "register_operand" "=&v") (vec_merge:V_DI @@ -2286,6 +2447,56 @@ DONE; }) +(define_insn_and_split "mul<mode>3_dup_exec" + [(set (match_operand:V_DI 0 "register_operand" "=&v") + (vec_merge:V_DI + (mult:V_DI + (vec_duplicate:V_DI + (match_operand:DI 1 "gcn_alu_operand" " Sv")) + (match_operand:V_DI 2 "gcn_alu_operand" "vDA")) + (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0") + (match_operand:DI 4 "gcn_exec_reg_operand" " e"))) + (clobber (match_scratch:<VnSI> 5 "=&v"))] + "" + "#" + "reload_completed" + 
[(const_int 0)] + { + rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0); + rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1); + rtx left_lo = gcn_operand_part (DImode, operands[1], 0); + rtx left_hi = gcn_operand_part (DImode, operands[1], 1); + rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0); + rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1); + rtx exec = operands[4]; + rtx tmp = operands[5]; + + rtx old_lo, old_hi; + if (GET_CODE (operands[3]) == UNSPEC) + { + old_lo = old_hi = gcn_gen_undef (<VnSI>mode); + } + else + { + old_lo = gcn_operand_part (<MODE>mode, operands[3], 0); + old_hi = gcn_operand_part (<MODE>mode, operands[3], 1); + } + + rtx undef = gcn_gen_undef (<VnSI>mode); + + emit_insn (gen_mul<vnsi>3_dup_exec (out_lo, left_lo, right_lo, old_lo, + exec)); + emit_insn (gen_umul<vnsi>3_highpart_dup_exec (out_hi, left_lo, right_lo, + old_hi, exec)); + emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_lo, undef, exec)); + emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec)); + emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_lo, right_hi, undef, exec)); + emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec)); + emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_hi, undef, exec)); + emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec)); + DONE; + }) + (define_insn_and_split "mul<mode>3_zext" [(set (match_operand:V_DI 0 "register_operand" "=&v") (mult:V_DI @@ -3053,7 +3264,8 @@ "flag_unsafe_math_optimizations" "v_sqrt%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) (define_insn "sqrt<mode>2" [(set (match_operand:FP 0 "register_operand" "= v") @@ -3062,7 +3274,8 @@ "flag_unsafe_math_optimizations" "v_sqrt%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) ; These FP unops have f64, f32 and f16 versions. (define_int_iterator MATH_UNOP_1OR2REG @@ -3352,7 +3565,8 @@ "" "v_rcp%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) ;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the ;; one that matches op3 adjusted for best results in reciprocal division. 
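A scalar sketch of the decomposition used by the V_DI multiply splitters
above (illustrative only; function name hypothetical; only partial
products landing below bit 64 can affect the low 64-bit result):

  #include <stdint.h>

  /* 64x64 -> low 64 bits, mirroring mul<mode>3_dup's per-lane steps:
     v_mul_lo_u32, v_mul_hi_u32, then the cross products accumulated
     into the high word.  */
  static uint64_t
  mul64_lowpart (uint64_t a, uint64_t b)
  {
    uint32_t a_lo = a, a_hi = a >> 32;
    uint32_t b_lo = b, b_hi = b >> 32;
    uint64_t p0 = (uint64_t) a_lo * b_lo;
    uint32_t hi = (uint32_t) (p0 >> 32) + a_hi * b_lo + a_lo * b_hi;
    return ((uint64_t) hi << 32) | (uint32_t) p0;
  }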
@@ -3724,6 +3938,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,*,yes,yes")]) @@ -3778,6 +3993,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,*,yes,yes")]) @@ -3836,6 +4052,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,yes,yes")]) @@ -3859,6 +4076,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,yes,yes")]) @@ -4049,6 +4267,32 @@ DONE; }) +(define_expand "mask_gather_load<mode><vndi>" + [(set:V_MOV (match_operand:V_MOV 0 "register_operand") + (unspec:V_MOV + [(match_operand:DI 1 "register_operand") + (match_operand:<VnDI> 2 "register_operand") + (match_operand 3 "immediate_operand") + (match_operand:SI 4 "gcn_alu_operand") + (match_operand:DI 5 "") + (match_operand:V_MOV 6 "maskload_else_operand")] + UNSPEC_GATHER))] + "" + { + rtx exec = force_reg (DImode, operands[5]); + + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1], + operands[2], operands[4], + INTVAL (operands[3]), exec); + + emit_insn (gen_gather<mode>_insn_1offset_exec (operands[0], addr, + const0_rtx, const0_rtx, + const0_rtx, + gcn_gen_undef (<MODE>mode), + exec)); + DONE; + }) + (define_expand "mask_scatter_store<mode><vnsi>" [(match_operand:DI 0 "register_operand") (match_operand:<VnSI> 1 "register_operand") @@ -4077,6 +4321,27 @@ DONE; }) +(define_expand "mask_scatter_store<mode><vndi>" + [(match_operand:DI 0 "register_operand") + (match_operand:<VnDI> 1 "register_operand") + (match_operand 2 "immediate_operand") + (match_operand:SI 3 "gcn_alu_operand") + (match_operand:V_MOV 4 "register_operand") + (match_operand:DI 5 "")] + "" + { + rtx exec = force_reg (DImode, operands[5]); + + rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0], + operands[1], operands[3], + INTVAL (operands[2]), exec); + + emit_insn (gen_scatter<mode>_insn_1offset_exec (addr, const0_rtx, + operands[4], const0_rtx, + const0_rtx, exec)); + DONE; + }) + (define_code_iterator cond_op [plus minus mult]) (define_expand "cond_<expander><mode>" @@ -4397,7 +4662,7 @@ rtx tmp = gen_reg_rtx (<MODE>mode); rtx v1 = gen_rtx_REG (<MODE>mode, VGPR_REGNO (1)); - emit_insn (gen_mul<mode>3_dup (tmp, v1, operands[2])); + emit_insn (gen_mul<mode>3_dup (tmp, operands[2], v1)); emit_insn (gen_add<mode>3_dup (operands[0], tmp, operands[1])); DONE; }) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 0ce5a29..5ffeb23 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -54,6 +54,7 @@ #include "gimple.h" #include "cgraph.h" #include "case-cfn-macros.h" +#include "opts.h" /* This file should be included last. 
*/
#include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
 
   if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
     flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+  /* TODO: This seems to produce tighter loops, but the testsuite expects it
+     to be set to '2', so I'll leave it default for now.
+     SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+			  param_vect_partial_vector_usage, 1); */
 }
 
 /* }}}  */
@@ -1275,13 +1281,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS) \
 }
 
 #define GEN_VNM_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN_NOEXEC (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN_NOEXEC (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
 static rtx \
 gen_##PREFIX##vNm##SUFFIX (PARAMS) \
 { \
@@ -1289,13 +1295,13 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS) \
 \
   switch (mode) \
     { \
-    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS); \
-    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS); \
-    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS); \
+    USE_QHF (case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS);) \
+    USE_QHF (case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS);) \
+    USE_QHF (case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS);) \
     case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS); \
-    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS); \
+    USE_QHF (case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS);) \
     case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS); \
-    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS); \
+    USE_QHF (case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS);) \
     default: \
       break; \
     } \
@@ -1340,13 +1346,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
 }
 
 #define GEN_VNM(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
 USE_TI (GEN_VN (PREFIX, ti##SUFFIX, A(PARAMS), A(ARGS))) \
 static rtx \
 gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
 { \
@@ -1355,15 +1361,22 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
 \
   switch (mode) \
     { \
-    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec); \
-    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec); \
-    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec); \
-    case E_SImode: return
gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \ - case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec); \ - case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \ - case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec); \ - case E_TImode: \ - USE_TI (return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \ + USE_QHF (case E_QImode: \ + return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec);) \ + USE_QHF (case E_HImode: \ + return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec);) \ + USE_QHF (case E_HFmode: \ + return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec);) \ + case E_SImode: \ + return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \ + USE_QHF (case E_SFmode: \ + return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec);) \ + case E_DImode: \ + return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \ + USE_QHF (case E_DFmode: \ + return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec);) \ + USE_TI (case E_TImode: \ + return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \ default: \ break; \ } \ @@ -1372,7 +1385,8 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \ return NULL_RTX; \ } -/* These have TImode support. */ +/* These support everything. */ +#define USE_QHF(ARGS) ARGS #define USE_TI(ARGS) ARGS GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src)) GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src)) @@ -1382,6 +1396,7 @@ GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src)) #define USE_TI(ARGS) GEN_VNM (add,3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) GEN_VN (add,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) +GEN_VN (add,di3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) GEN_VN (add,si3_vcc_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc), A(dest, src1, src2, vcc)) GEN_VN (add,di3_sext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) @@ -1393,15 +1408,20 @@ GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc), GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin), A(dest, src1, src2, vccout, vccin)) GEN_VN (and,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) -GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift)) GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec), A(dest, addr, src, exec)) GEN_VNM (gather,_expr, A(rtx dest, rtx addr, rtx as, rtx vol), A(dest, addr, as, vol)) -GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) GEN_VN (sub,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) GEN_VN_NOEXEC (vec_series,si, A(rtx dest, rtx x, rtx c), A(dest, x, c)) +/* These do not have QI, HI, or any FP support. */ +#undef USE_QHF +#define USE_QHF(ARGS) +GEN_VNM (ashl,3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift)) +GEN_VNM (mul,3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2)) + +#undef USE_QHF #undef USE_TI #undef GEN_VNM #undef GEN_VN @@ -1995,8 +2015,8 @@ gcn_expand_vector_init (rtx op0, rtx vec) rtx addr = gen_reg_rtx (addrmode); int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0))); - emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)), - GEN_INT (unit_size))); + emit_insn (gen_mulvNsi3_dup (ramp, GEN_INT (unit_size), + gen_rtx_REG (offsetmode, VGPR_REGNO (1)))); bool simple_repeat = true; @@ -2293,36 +2313,46 @@ gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem, Return values. 
ADDR_SPACE_FLAT - return VnDImode vector of absolute addresses.
-   ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.  */
+   ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.
+   64-bit offsets - return VnDImode vector of absolute addresses.  */
 
 rtx
 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
 			   bool unsigned_p, rtx exec)
 {
   int vf = GET_MODE_NUNITS (GET_MODE (offsets));
-  rtx tmpsi = gen_reg_rtx (VnMODE (vf, SImode));
-  rtx tmpdi = gen_reg_rtx (VnMODE (vf, DImode));
+  rtx scaled_offsets = gen_reg_rtx (GET_MODE (offsets));
+  rtx abs_addr = gen_reg_rtx (VnMODE (vf, DImode));
+  bool use_di = GET_MODE_INNER (GET_MODE (scaled_offsets)) == DImode;
 
   if (CONST_INT_P (scale)
       && INTVAL (scale) > 0
       && exact_log2 (INTVAL (scale)) >= 0)
-    emit_insn (gen_ashlvNsi3 (tmpsi, offsets,
-			      GEN_INT (exact_log2 (INTVAL (scale))),
-			      NULL, exec));
+    emit_insn (gen_ashlvNm3 (scaled_offsets, offsets,
+			     GEN_INT (exact_log2 (INTVAL (scale))),
+			     NULL, exec));
   else
-    emit_insn (gen_mulvNsi3_dup (tmpsi, offsets, scale, NULL, exec));
+    emit_insn (gen_mulvNm3_dup (scaled_offsets, scale, offsets, NULL, exec));
 
+  /* No instructions support DImode offsets.  */
+  if (use_di)
+    {
+      emit_insn (gen_addvNdi3_dup (abs_addr, base, scaled_offsets, NULL, exec));
+      return abs_addr;
+    }
   /* "Global" instructions do not support negative register offsets.  */
-  if (as == ADDR_SPACE_FLAT || !unsigned_p)
+  else if (as == ADDR_SPACE_FLAT || !unsigned_p)
     {
       if (unsigned_p)
-	emit_insn (gen_addvNdi3_zext_dup2 (tmpdi, tmpsi, base, NULL, exec));
+	emit_insn (gen_addvNdi3_zext_dup2 (abs_addr, scaled_offsets, base,
+					   NULL, exec));
       else
-	emit_insn (gen_addvNdi3_sext_dup2 (tmpdi, tmpsi, base, NULL, exec));
-      return tmpdi;
+	emit_insn (gen_addvNdi3_sext_dup2 (abs_addr, scaled_offsets, base,
+					   NULL, exec));
+      return abs_addr;
     }
   else if (as == ADDR_SPACE_GLOBAL)
-    return tmpsi;
+    return scaled_offsets;
 
   gcc_unreachable ();
 }
@@ -5315,8 +5345,12 @@ gcn_preferred_vector_alignment (const_tree type)
 static bool
 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
 					   const_tree type, int misalignment,
-					   bool is_packed)
+					   bool is_packed,
+					   bool is_gather_scatter)
 {
+  if (is_gather_scatter)
+    return true;
+
   if (is_packed)
     return false;
 
@@ -5761,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class,
   return bsd_libc_has_function (fn_class, type);
 }
 
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER.  */
+
+static bool
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+			   int ARG_UNUSED (scale),
+			   unsigned int ARG_UNUSED (group_size))
+{
+  return true;
+}
+
 /* }}}  */
 /* {{{ md_reorg pass.  */
 
@@ -6124,12 +6168,22 @@ gcn_md_reorg (void)
      detects the missed cases, and inserts the documented number of NOPs
      required for correct execution.  */
 
+  /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
+     s_nops; see sections 5.7 and esp. 5.7.2 in its ISA manual.
+     The assert here is a reminder to add those.
*/
+  STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1);
+
+  if (TARGET_NO_MANUAL_NOPS)
+    return;
+
   const int max_waits = 5;
   struct ilist
   {
     rtx_insn *insn;
     attr_unit unit;
-    attr_delayeduse delayeduse;
+    attr_type type;
+    attr_flatmemaccess flatmemaccess;
+    bool delayeduse;
     HARD_REG_SET writes;
     HARD_REG_SET reads;
     int age;
@@ -6150,7 +6204,29 @@ gcn_md_reorg (void)
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
-      attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+      attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
+      bool delayeduse;
+      if (TARGET_CDNA3_NOPS)
+	switch (iflatmemaccess)
+	  {
+	  case FLATMEMACCESS_STORE:
+	  case FLATMEMACCESS_STOREX34:
+	  case FLATMEMACCESS_ATOMIC:
+	  case FLATMEMACCESS_CMPSWAPX2:
+	    delayeduse = true;
+	    break;
+	  case FLATMEMACCESS_LOAD:
+	  case FLATMEMACCESS_ATOMICWAIT:
+	  case FLATMEMACCESS_NO:
+	    delayeduse = false;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+      else
+	delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
+		      || iflatmemaccess == FLATMEMACCESS_STOREX34);
+
       int ivccwait = get_attr_vccwait (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
@@ -6195,16 +6271,26 @@ gcn_md_reorg (void)
 		  && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
 	nops_rqd = 5 - prev_insn->age;
 
-      /* VALU writes SGPR/VCC followed by v_{read,write}lane using
-	 SGPR/VCC as lane select requires 4 wait states.  */
+      /* VALU writes SGPR/VCC followed by
+	 - v_{read,write}lane using SGPR/VCC as lane select requires
+	   4 wait states
+	 - [CDNA3] VALU reads SGPR as constant requires 1 wait state
+	 - [CDNA3] VALU reads SGPR as carry-in requires no wait states  */
       if ((prev_insn->age + nops_rqd) < 4
 	  && prev_insn->unit == UNIT_VECTOR
-	  && get_attr_laneselect (insn) == LANESELECT_YES
+	  && get_attr_laneselect (insn) != LANESELECT_NO
 	  && (hard_reg_set_intersect_p
 		(depregs, reg_class_contents[(int) SGPR_REGS])
 	      || hard_reg_set_intersect_p
 		   (depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
 	nops_rqd = 4 - prev_insn->age;
+      else if (TARGET_CDNA3_NOPS
+	       && (prev_insn->age + nops_rqd) < 1
+	       && prev_insn->unit == UNIT_VECTOR
+	       && iunit == UNIT_VECTOR
+	       && hard_reg_set_intersect_p
+		    (depregs, reg_class_contents[(int) SGPR_REGS]))
+	nops_rqd = 1 - prev_insn->age;
 
       /* VALU writes VGPR followed by VALU_DPP reading that VGPR
 	 requires 2 wait states.  */
@@ -6217,22 +6303,128 @@ gcn_md_reorg (void)
 	    nops_rqd = 2 - prev_insn->age;
 	}
 
+      /* VALU writes EXEC followed by VALU DPP op requires 5 NOPs.  */
+      if ((prev_insn->age + nops_rqd) < 5
+	  && itype == TYPE_VOP_DPP
+	  && prev_insn->unit == UNIT_VECTOR
+	  && TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG))
+	nops_rqd = 5 - prev_insn->age;
+
       /* Store that requires input registers are not overwritten by
-	 following instruction.  */
-      if ((prev_insn->age + nops_rqd) < 1
-	  && prev_insn->delayeduse == DELAYEDUSE_YES
+	 following instruction.
+	 For CDNA3 only, VALU writes require 2 NOPs rather than 1.
+	 CDNA3 additionally requires 1 or 2 NOPs for global & scratch
+	 store/atomic.  */
+      if (TARGET_CDNA3_NOPS
+	  && (prev_insn->age + nops_rqd) < 2
+	  && prev_insn->delayeduse
+	  && iunit == UNIT_VECTOR
+	  && ((hard_reg_set_intersect_p
+		(prev_insn->reads, iwrites))))
+	nops_rqd = 2 - prev_insn->age;
+      else if ((prev_insn->age + nops_rqd) < 1
+	       && prev_insn->delayeduse
 	  && ((hard_reg_set_intersect_p
 	       (prev_insn->reads, iwrites))))
 	nops_rqd = 1 - prev_insn->age;
 
-      /* Instruction that requires VCC is not written too close before
-	 using it.
*/ + /* Instruction (such as v_div_fmas) that requires VCC is not written + too close before using it. */ if (prev_insn->age < ivccwait && (hard_reg_set_intersect_p (prev_insn->writes, reg_class_contents[(int)VCC_CONDITIONAL_REG]))) nops_rqd = ivccwait - prev_insn->age; + /* NOTE: The following condition for adding a wait state exists, but + GCC does not access the special registers using their SGPR#. + Thus, no action is required here. The following wait-state + condition exists at least for VEGA/gfx900+ to CDNA3: + Mixed use of VCC: alias vs. SGPR# - v_readlane, + v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale + followed by VALU reads VCC as constant requires 1 wait state. + (As carry-in, it requires none.) + [VCC can be accessed by name or logical SGPR that holds it.] */ + + /* Testing indicates that CDNA3 requires an s_nop between + e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'. + Thus: add it between v_cmp writing VCC and VALU read of VCC. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && iunit == UNIT_VECTOR + && (hard_reg_set_intersect_p + (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG])) + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp, + v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by: + - VALU reads SGPR as constant requires 1 wait state + - VALU reads SGPR as carry-in requires no wait state + - v_readlane/v_writelane reads SGPR as lane select requires 4 wait + states. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 4 + && iunit == UNIT_VECTOR + && prev_insn->unit == UNIT_VECTOR + && hard_reg_set_intersect_p + (depregs, reg_class_contents[(int) SGPR_SRC_REGS])) + { + if (get_attr_laneselect (insn) != LANESELECT_NO) + nops_rqd = 4 - prev_insn->age; + else if ((prev_insn->age + nops_rqd) < 1) + nops_rqd = 1 - prev_insn->age; + } + + /* CDNA3: v_cmpx followed by + - v_readlane, v_readfirstlane, v_writelane requires 4 wait states + - VALU reads EXEC as constant requires 2 wait states + - other VALU requires no wait state */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 4 + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX + && get_attr_laneselect (insn) != LANESELECT_NO) + nops_rqd = 4 - prev_insn->age; + else if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 2 + && iunit == UNIT_VECTOR + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX + && TEST_HARD_REG_BIT (ireads, EXECZ_REG)) + nops_rqd = 2 - prev_insn->age; + + /* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn + requires 1 wait state. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD + && get_attr_laneselect (insn) == LANESELECT_READ + && hard_reg_set_intersect_p + (depregs, reg_class_contents[(int) VGPR_REGS])) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU op which uses OPSEL or SDWA and changes the result's + bit position, followed by a VALU op that consumes the result, + requires 1 wait state. + FIXME: Handle OPSEL, once used. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && prev_insn->type == TYPE_VOP_SDWA + && !hard_reg_set_empty_p (depregs)) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by a non-trans VALU + op that consumes its result requires 1 wait state.
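+ An illustrative pairing (assumed from the rule above, not taken from the + manual): + v_rcp_f64 v[4:5], v[6:7] ; trans op + s_nop 0x0 ; 1 wait state + v_add_f64 v[8:9], v[4:5], v[10:11]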
*/ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && iunit == UNIT_VECTOR + && get_attr_transop (prev_insn->insn) == TRANSOP_YES + && get_attr_transop (insn) == TRANSOP_NO + && !hard_reg_set_empty_p (depregs)) + nops_rqd = 1 - prev_insn->age; + /* CDNA1: write VGPR before v_accvgpr_write reads it. */ if (TARGET_AVGPR_CDNA1_NOPS && (prev_insn->age + nops_rqd) < 2 @@ -6264,8 +6456,8 @@ gcn_md_reorg (void) } /* Insert the required number of NOPs. */ - for (int i = nops_rqd; i > 0; i--) - emit_insn_after (gen_nop (), last_insn); + if (nops_rqd > 0) + emit_insn_after (gen_nops (GEN_INT (nops_rqd - 1)), last_insn); /* Age the previous instructions. We can also ignore writes to registers subsequently overwritten. */ @@ -6288,7 +6480,9 @@ gcn_md_reorg (void) /* Track the current instruction as a previous instruction. */ back[oldest].insn = insn; back[oldest].unit = iunit; - back[oldest].delayeduse = idelayeduse; + back[oldest].type = itype; + back[oldest].flatmemaccess = iflatmemaccess; + back[oldest].delayeduse = delayeduse; back[oldest].writes = iwrites; back[oldest].reads = ireads; back[oldest].age = 0; @@ -7109,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem) H - print second part of a multi-reg value (high-part of 2-reg value) J - print third part of a multi-reg value K - print fourth part of a multi-reg value + R - print a scalar register number as an integer. Temporary hack. + V - print a vector register number as an integer. Temporary hack. + + Additionally, the standard built-in modifiers c, n, a, and l exist; see + gccint's "Output Templates and Operand Substitution" for details. */ void @@ -7957,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl) gcn_vectorize_builtin_vectorized_function #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode +#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER +#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 9193461..4130cf6 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -312,18 +312,33 @@ ; We need to be able to identify v_readlane and v_writelane with ; SGPR lane selection in order to handle "Manually Inserted Wait States". -(define_attr "laneselect" "yes,no" (const_string "no")) +(define_attr "laneselect" "write,read,no" (const_string "no")) -; Identify instructions that require a "Manually Inserted Wait State" if -; their inputs are overwritten by subsequent instructions. +; Classify global or flat memory accesses: stores and loads followed by a +; waitcnt, and flat/global atomic accesses, possibly followed by a waitcnt. +; 'storex34' denotes FLAT_STORE_X{3,4}. +; 'cmpswapx2' denotes FLAT_ATOMIC_{F}CMPSWAP_X2. +; Used to handle "Manually Inserted Wait State". -(define_attr "delayeduse" "yes,no" (const_string "no")) +(define_attr "flatmemaccess" + "store,storex34,load,atomic,atomicwait,cmpswapx2,no" + (const_string "no")) + +; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State" +; handling. + +(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no")) ; Identify instructions that require "Manually Inserted Wait State" if ; a previous instruction writes to VCC. The number gives the number of NOPs.
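; For example (illustrative), v_div_fmas_f64 implicitly reads the VCC that a ; preceding v_div_scale_f64 wrote; giving the consumer a vccwait value of 4 ; makes the reorg pass pad the distance between the two to four wait states.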
(define_attr "vccwait" "" (const_int 0)) +; Mark trans ops such as v_{exp,rsq,sqrt,sin,cos,log,...}_F{16,32,64} +; for later conditional s_nop insertion. + +(define_attr "transop" "yes,no" (const_string "no")) + ;; }}} ;; {{{ Iterators useful across the wole machine description @@ -414,6 +429,15 @@ "s_nop\t0x0" [(set_attr "type" "sopp")]) +; Variant of 'nop' that accepts a count argument. +; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however, +; as %0 prints decimals, only 0 to 9 (= 1 to 10 nops) can be used. +(define_insn "nops" + [(match_operand 0 "const_int_operand")] + "" + "s_nop\t0x%0" + [(set_attr "type" "sopp")]) + ; FIXME: What should the value of the immediate be? Zero is disallowed, so ; pick 1 for now. (define_insn "trap" @@ -555,9 +579,12 @@ } [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat, flat,flat,flat,flat") + (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store") + (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*") (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*") (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12") - (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")]) + (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*") + (set_attr "laneselect" "*,*,read,*,*,*,*,*,*,*,*,*,*,*,*")]) ; 32bit move pattern @@ -565,38 +592,38 @@ [(set (match_operand:SISF 0 "nonimmediate_operand") (match_operand:SISF 1 "gcn_load_operand"))] "" - {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] - [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 - [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,RB ;smem ,* ,12,* ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) - [&SD ,RB ;smem ,* ,12,* ,on ] ^ - [RB ,Sm ;smem ,* ,12,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0 - [Sm ,RS ;smem ,* ,12,* ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm ,RS ;smem ,* ,12,* ,on ] ^ - [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dword\t%1, %A0 - [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1 - [Sg ,v ;vop3a,none,8 ,* ,* ] v_readlane_b32\t%0, %1, 0 - [v ,Sv ;vop3a,none,8 ,* ,* ] v_writelane_b32\t%0, %1, 0 - [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1 - [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1 - [a ,a ;vop1 ,* ,4,cdna2,* ] v_accvgpr_mov_b32\t%0, %1 - [v ,RF ;flat ,* ,12,* ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a ,RF ;flat ,* ,12,cdna2,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store_dword\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1 - [RLRG,v ;ds ,* ,12,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,* ,12,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [SD ,Y ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [v ,RM ;flat ,* ,12,* ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a ,RM ;flat ,* ,12,cdna2,on ] ^ - [RM ,v ;flat ,* ,12,* ,* ] global_store_dword\t%A0, %1%O0%g0 - [RM ,a ;flat ,* ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess] + [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1 + [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,RB ;smem ,* ,12,* ,off,* ,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) + [&SD ,RB ;smem ,* ,12,* ,on ,* ,* ] ^ + [RB ,Sm ;smem ,* ,12,* ,* ,* ,* ] 
s_buffer_store%s1\t%1, s[0:3], %0 + [Sm ,RS ;smem ,* ,12,* ,off,* ,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm ,RS ;smem ,* ,12,* ,on ,* ,* ] ^ + [RS ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_store_dword\t%1, %A0 + [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [Sg ,v ;vop3a,none,8 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0 + [v ,Sv ;vop3a,none,8 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0 + [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1 + [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1 + [a ,a ;vop1 ,* ,4,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1 + [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store_dword\t%A0, %1%O0%g0 + [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ + [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [SD ,Y ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store_dword\t%A0, %1%O0%g0 + [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ }) ; 8/16bit move pattern @@ -606,31 +633,31 @@ [(set (match_operand:QIHI 0 "nonimmediate_operand") (match_operand:QIHI 1 "gcn_load_operand"))] "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])" - {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] - [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 - [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1 - [Sg ,v ;vop3a,none,4 ,* ,* ] v_readlane_b32\t%0, %1, 0 - [v ,Sv ;vop3a,none,4 ,* ,* ] v_writelane_b32\t%0, %1, 0 - [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1 - [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1 - [a ,a ;vop1 ,* ,8,cdna2,* ] v_accvgpr_mov_b32\t%0, %1 - [v ,RF ;flat ,* ,12,* ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a ,RF ;flat ,* ,12,cdna2,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store%s0\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1 - [RLRG,v ;ds ,* ,12,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,* ,12,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,RM ;flat ,* ,12,* ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a ,RM ;flat ,* ,12,cdna2,on ] ^ - [RM ,v ;flat ,* ,12,* ,* ] global_store%s0\t%A0, %1%O0%g0 - [RM ,a ;flat ,* ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess] + [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1 + [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [Sg ,v ;vop3a,none,4 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0 + [v ,Sv ;vop3a,none,4 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0 + [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1 + [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] 
v_accvgpr_write_b32\t%0, %1 + [a ,a ;vop1 ,* ,8,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1 + [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store%s0\t%A0, %1%O0%g0 + [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ + [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store%s0\t%A0, %1%O0%g0 + [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ }) ; 64bit move pattern @@ -639,34 +666,34 @@ [(set (match_operand:DIDF 0 "nonimmediate_operand") (match_operand:DIDF 1 "general_operand"))] "GET_CODE(operands[1]) != SYMBOL_REF" - {@ [cons: =0, 1; attrs: type, length, cdna, xnack] - [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1 - [SD ,C ;sop1 ,8 ,* ,* ] ^ - [SD ,DB ;mult ,* ,* ,* ] # - [RS ,Sm ;smem ,12,* ,* ] s_store_dwordx2\t%1, %A0 - [Sm ,RS ;smem ,12,* ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm ,RS ;smem ,12,* ,on ] ^ - [v ,v ;vmult,* ,* ,* ] # - [v ,DB ;vmult,* ,* ,* ] # - [Sg ,v ;vmult,* ,* ,* ] # - [v ,Sv ;vmult,* ,* ,* ] # - [v ,^a ;vmult,* ,* ,* ] # - [a ,v ;vmult,* ,* ,* ] # - [a ,a ;vmult,* ,cdna2,* ] # - [v ,RF ;flat ,12,* ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,12,* ,on ] ^ - [^a ,RF ;flat ,12,cdna2,off] ^ - [&^a ,RF ;flat ,12,cdna2,on ] ^ - [RF ,v ;flat ,12,* ,* ] flat_store_dwordx2\t%A0, %1%O0%g0 - [RF ,a ;flat ,12,cdna2,* ] ^ - [RLRG,v ;ds ,12,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,12,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,RM ;flat ,12,* ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,12,* ,on ] ^ - [^a ,RM ;flat ,12,cdna2,off] ^ - [&^a ,RM ;flat ,12,cdna2,on ] ^ - [RM ,v ;flat ,12,* ,* ] global_store_dwordx2\t%A0, %1%O0%g0 - [RM ,a ;flat ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess] + [SD ,SSA ;sop1 ,4 ,* ,* ,* ] s_mov_b64\t%0, %1 + [SD ,C ;sop1 ,8 ,* ,* ,* ] ^ + [SD ,DB ;mult ,* ,* ,* ,* ] # + [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx2\t%1, %A0 + [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm ,RS ;smem ,12,* ,on ,* ] ^ + [v ,v ;vmult,* ,* ,* ,* ] # + [v ,DB ;vmult,* ,* ,* ,* ] # + [Sg ,v ;vmult,* ,* ,* ,* ] # + [v ,Sv ;vmult,* ,* ,* ,* ] # + [v ,^a ;vmult,* ,* ,* ,* ] # + [a ,v ;vmult,* ,* ,* ,* ] # + [a ,a ;vmult,* ,cdna2,* ,* ] # + [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,12,* ,on ,load ] ^ + [^a ,RF ;flat ,12,cdna2,off,load ] ^ + [&^a ,RF ;flat ,12,cdna2,on ,load ] ^ + [RF ,v ;flat ,12,* ,* ,store] flat_store_dwordx2\t%A0, %1%O0%g0 + [RF ,a ;flat ,12,cdna2,* ,store] ^ + [RLRG,v ;ds ,12,* ,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,12,* ,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,12,* ,on ,load ] ^ + [^a ,RM ;flat ,12,cdna2,off,load ] ^ + [&^a ,RM ;flat ,12,cdna2,on ,load ] ^ + [RM 
,v ;flat ,12,* ,* ,store] global_store_dwordx2\t%A0, %1%O0%g0 + [RM ,a ;flat ,12,cdna2,* ,store] ^ } "reload_completed && ((!MEM_P (operands[0]) && !MEM_P (operands[1]) @@ -704,31 +731,31 @@ [(set (match_operand:TI 0 "nonimmediate_operand") (match_operand:TI 1 "general_operand" ))] "" - {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack] - [SD ,SSB;mult ,* ,* ,* ,* ] # - [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0 - [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm,RS ;smem ,yes,12,* ,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store_dwordx4\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,RF ;flat ,* ,12,* ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a,RF ;flat ,* ,12,cdna2,on ] ^ - [v ,v ;vmult,* ,* ,* ,* ] # - [v ,Sv ;vmult,* ,* ,* ,* ] # - [SD ,v ;vmult,* ,* ,* ,* ] # - [RM ,v ;flat ,yes,12,* ,* ] global_store_dwordx4\t%A0, %1%O0%g0 - [RM ,a ;flat ,yes,12,cdna2,* ] ^ - [v ,RM ;flat ,* ,12,* ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a,RM ;flat ,* ,12,cdna2,on ] ^ - [RL ,v ;ds ,* ,12,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RL ;ds ,* ,12,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,^a ;vmult,* ,* ,* ,* ] # - [a ,v ;vmult,* ,* ,* ,* ] # - [a ,a ;vmult,* ,* ,cdna2,* ] # + {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess] + [SD ,SSB;mult ,* ,* ,* ,* ] # + [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx4\t%1, %A0 + [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm,RS ;smem ,12,* ,on ,* ] ^ + [RF ,v ;flat ,12,* ,* ,storex34] flat_store_dwordx4\t%A0, %1%O0%g0 + [RF ,a ;flat ,12,cdna2,* ,storex34] ^ + [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,12,* ,on ,load ] ^ + [^a ,RF ;flat ,12,cdna2,off,load ] ^ + [&^a,RF ;flat ,12,cdna2,on ,load ] ^ + [v ,v ;vmult,* ,* ,* ,* ] # + [v ,Sv ;vmult,* ,* ,* ,* ] # + [SD ,v ;vmult,* ,* ,* ,* ] # + [RM ,v ;flat ,12,* ,* ,storex34] global_store_dwordx4\t%A0, %1%O0%g0 + [RM ,a ;flat ,12,cdna2,* ,storex34] ^ + [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,12,* ,on ,load ] ^ + [^a ,RM ;flat ,12,cdna2,off,load ] ^ + [&^a,RM ;flat ,12,cdna2,on ,load ] ^ + [RL ,v ;ds ,12,* ,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RL ;ds ,12,* ,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,^a ;vmult,* ,* ,* ,* ] # + [a ,v ;vmult,* ,* ,* ,* ] # + [a ,a ;vmult,* ,cdna2,* ,* ] # } "reload_completed && REG_P (operands[0]) @@ -1077,6 +1104,7 @@ s_cmp%D1\t%2, %3 v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "sopc,vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_insn "cstoredi4_vector" @@ -1087,6 +1115,7 @@ "" "v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_expand "cbranchdi4" @@ -1113,6 +1142,7 @@ "" "v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_expand "cbranch<mode>4" @@ -1985,6 +2015,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "12")]) ; FIXME: These patterns are disabled because the instructions 
don't @@ -2006,6 +2037,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "12")]) (define_mode_attr x2 [(SI "DI") (DI "TI")]) @@ -2053,7 +2085,7 @@ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") (set_attr "length" "12") - (set_attr "delayeduse" "*,yes,yes")]) + (set_attr "flatmemaccess" "*,cmpswapx2,cmpswapx2")]) (define_insn "sync_compare_and_swap<mode>_lds_insn" [(set (match_operand:SIDI 0 "register_operand" "= v") @@ -2151,7 +2183,7 @@ ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol"); @@ -2163,7 +2195,7 @@ ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); @@ -2173,6 +2205,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,load,load") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) @@ -2209,7 +2242,7 @@ : TARGET_WBINVL1_CACHE ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1" : "error: cache architectire unspecified"); case 2: return (TARGET_GLn_CACHE @@ -2217,7 +2250,7 @@ : TARGET_WBINVL1_CACHE ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1" : "error: cache architecture unspecified"); } break; @@ -2237,7 +2270,8 @@ ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;" + "flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2248,7 +2282,8 @@ ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" + "global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified"); } @@ -2257,6 +2292,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,store,store") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) @@ -2331,7 +2367,7 @@ ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? 
"buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : "error: cache architecture unspecified"); case 2: @@ -2344,7 +2380,7 @@ "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : "error: cache architecture unspecified"); @@ -2366,7 +2402,7 @@ ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2379,7 +2415,7 @@ "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified"); @@ -2389,6 +2425,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c131577..53e86c8 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3226,7 +3226,7 @@ remove_partial_avx_dependency (void) break; } - /* Only hanlde conversion here. */ + /* Only handle conversion here. */ machine_mode src_mode = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode; switch (src_mode) diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 2fedbeb..c2db305 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ -VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ VECTOR_MODE (FLOAT, BF, 2); /* V2BF */ VECTOR_MODE (FLOAT, HF, 6); /* V6HF */ @@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */ VECTOR_MODE (INT, QI, 12); /* V12QI */ VECTOR_MODE (INT, QI, 14); /* V14QI */ VECTOR_MODE (INT, HI, 6); /* V6HI */ -VECTOR_MODE (INT, SI, 64); /* V64SI */ INT_MODE (OI, 32); INT_MODE (XI, 64); diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index d244b225..09a35ef 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1362,7 +1362,9 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); if (arg_ok) set_option (opts, enum_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, + p + opt_len, + static_cast<int> (diagnostics::kind::unspecified), + input_location, global_dc); else { @@ -3613,6 +3615,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. 
*/ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + /* Can combine regparm with all attributes but fastcall, and thiscall. */ if (is_attribute_p ("regparm", name)) { @@ -3625,7 +3639,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) { - error ("regparam and thiscall attributes are not compatible"); + error ("regparm and thiscall attributes are not compatible"); } cst = TREE_VALUE (args); @@ -3646,19 +3660,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } - if (TARGET_64BIT) - { - /* Do not warn when emulating the MS ABI. */ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + /* Can combine fastcall with sseregparm. */ if (is_attribute_p ("fastcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3679,8 +3681,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, } } - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ + /* Can combine stdcall with regparm and sseregparm. */ else if (is_attribute_p ("stdcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3730,6 +3731,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, { error ("cdecl and thiscall attributes are not compatible"); } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("regparm and thiscall attributes are not compatible"); + } } /* Can combine sseregparm with all attributes. */ diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 49bd393..65e04d3 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -12442,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol; static rtx ix86_tls_get_addr (void) { + if (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + { + /* __tls_get_addr doesn't preserve vector registers. When a + function with no_caller_saved_registers attribute calls + __tls_get_addr, YMM and ZMM registers will be clobbered. + Issue an error and suggest -mtls-dialect=gnu2 in this case. */ + if (cfun->machine->func_type == TYPE_NORMAL) + error (G_("%<-mtls-dialect=gnu2%> must be used with a function" + " with the %<no_caller_saved_registers%> attribute")); + else + error (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%<-mtls-dialect=gnu2%> must be used with an" + " exception service routine") + : G_("%<-mtls-dialect=gnu2%> must be used with an" + " interrupt service routine")); + /* Don't issue the same error twice. */ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->call_saved_registers + = TYPE_DEFAULT_CALL_SAVED_REGISTERS; + } + if (!ix86_tls_symbol) { const char *sym @@ -20007,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree utype, ures, vce; utype = unsigned_type_for (TREE_TYPE (arg0)); /* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR - instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */ + instead of ABS_EXPR to handle overflow case(TYPE_MIN). 
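+ (Worked example: for a QImode element holding -128 the signed result + +128 is not representable, while ABSU_EXPR yields the unsigned value + 128, matching what pabsb stores.)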
*/ ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); loc = gimple_location (stmt); @@ -21491,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) /* Register pair for mask registers. */ if (mode == P2QImode || mode == P2HImode) return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; + return 1; } @@ -23132,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, So current solution is make constant disp as cheap as possible. */ if (GET_CODE (addr) == PLUS && x86_64_immediate_operand (XEXP (addr, 1), Pmode) - /* Only hanlde (reg + disp) since other forms of addr are mostly LEA, + /* Only handle (reg + disp) since other forms of addr are mostly LEA, there's no additional cost for the plus of disp. */ && register_operand (XEXP (addr, 0), Pmode)) { @@ -24788,6 +24809,12 @@ static void map_egpr_constraints (vec<const char *> &constraints) buf.safe_push (cur[j + 1]); j++; break; + case '{': + do + { + buf.safe_push (cur[j]); + } while (cur[j++] != '}'); + break; default: buf.safe_push (cur[j]); break; @@ -25205,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global) return DW_EH_PE_absptr; } -/* Implement targetm.vectorize.builtin_vectorization_cost. */ +/* Worker for ix86_builtin_vectorization_cost and the fallback calls + from ix86_vector_costs::add_stmt_cost. */ static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) +ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost, + machine_mode mode) { - bool fp = false; - machine_mode mode = TImode; + bool fp = FLOAT_MODE_P (mode); int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); - } - switch (type_of_cost) { case scalar_stmt: @@ -25277,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, COSTS_N_INSNS (ix86_cost->gather_static + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case vector_scatter_store: return ix86_vec_cost (mode, COSTS_N_INSNS (ix86_cost->scatter_static + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case cond_branch_taken: return ix86_cost->cond_taken_branch_cost; @@ -25302,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { - int n = TYPE_VECTOR_SUBPARTS (vectype); + int n = GET_MODE_NUNITS (mode); /* N - 1 element inserts into an SSE vector, the possible GPR -> XMM move is accounted for in add_stmt_cost. */ if (GET_MODE_BITSIZE (mode) <= 128) @@ -25330,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + machine_mode mode = TImode; + if (vectype != NULL) + mode = TYPE_MODE (vectype); + return ix86_default_vector_cost (type_of_cost, mode); +} + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. 
*/ @@ -25783,7 +25815,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree node, - tree vectype, int misalign, + tree vectype, int, vect_cost_model_location where) { unsigned retval = 0; @@ -26122,32 +26154,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (AGU and load ports). Try to account for this by scaling the construction cost by the number of elements involved. */ if ((kind == vec_construct || kind == vec_to_scalar) - && ((stmt_info - && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type - || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + && ((node + && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP + && SLP_TREE_LANES (node) == 1)) + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF + (SLP_TREE_REPRESENTATIVE (node)))) != INTEGER_CST)) - || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) - == VMAT_GATHER_SCATTER))) - || (node - && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP - && SLP_TREE_LANES (node) == 1)) - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF - (SLP_TREE_REPRESENTATIVE (node)))) - != INTEGER_CST)) - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) - == VMAT_GATHER_SCATTER))))) - { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) + == VMAT_GATHER_SCATTER))))) + { + stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } else if ((kind == vec_construct || kind == scalar_to_vec) && node && SLP_TREE_DEF_TYPE (node) == vect_external_def) { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); unsigned i; tree op; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) @@ -26211,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, TREE_VISITED (op) = 0; } if (stmt_cost == -1) - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); if (kind == vec_perm && vectype && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index eb52699..a50475b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -2968,7 +2968,8 @@ (match_operand:SWI248 1 "const_int_operand"))] "optimize_insn_for_size_p () && optimize_size > 1 && operands[1] != const0_rtx - && operands[1] != constm1_rtx + && (operands[1] != constm1_rtx + || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0]))) && IN_RANGE (INTVAL (operands[1]), -128, 127) && !ix86_red_zone_used && REGNO (operands[0]) != SP_REG" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d88c3d6..ec74f93 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -21729,6 +21729,19 @@ (const_string "orig"))) (set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")]) +;; Eliminate redundancy caused by +;; /* Special case TImode to 128-bit vector conversions via V2DI. 
*/ +;; in ix86_expand_vector_move + +(define_split + [(set (match_operand:V2DI 0 "register_operand") + (vec_concat:V2DI + (subreg:DI (match_operand:TI 1 "register_operand") 0) + (subreg:DI (match_dup 1) 8)))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + [(set (match_dup 0) + (subreg:V2DI (match_dup 1) 0))]) + (define_insn "*vec_concatv2di_0" [(set (match_operand:V2DI 0 "register_operand" "=v,v ,x") (vec_concat:V2DI diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index b00fcc7..493f95e 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -11052,17 +11052,21 @@ static bool loongarch_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed) + bool is_packed, + bool is_gather_scatter) { if ((ISA_HAS_LSX || ISA_HAS_LASX) && STRICT_ALIGNMENT) { + if (is_gather_scatter) + return true; if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing) return false; if (misalignment == -1) return false; } return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* Return a PARALLEL containing NELTS elements, with element I equal diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index d897763..5fc8665 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -823,8 +823,6 @@ typedef struct { #define CASE_VECTOR_MODE Pmode -#define CASE_VECTOR_SHORTEN_MODE(MIN, MAX, BODY) Pmode - /* Define this as 1 if `char' should by default be signed; else as 0. */ #ifndef DEFAULT_SIGNED_CHAR #define DEFAULT_SIGNED_CHAR 1 diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index d326ca4..9796839 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -120,6 +120,51 @@ Target RejectNegative Alias(misa=,sm_89) march-map=sm_90a Target RejectNegative Alias(misa=,sm_89) +march-map=sm_100 +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_100f +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_100a +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_101 +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_101f +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_101a +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_103 +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_103f +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_103a +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_120 +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_120f +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_120a +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_121 +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_121f +Target RejectNegative Alias(misa=,sm_89) + +march-map=sm_121a +Target RejectNegative Alias(misa=,sm_89) + Enum Name(ptx_version) Type(enum ptx_version) Known PTX ISA versions (for use with the -mptx= option): diff --git a/gcc/config/pru/pru-pragma.cc b/gcc/config/pru/pru-pragma.cc index c3f3d33..9338780 100644 --- a/gcc/config/pru/pru-pragma.cc +++ b/gcc/config/pru/pru-pragma.cc @@ -46,21 +46,24 @@ pru_pragma_ctable_entry (cpp_reader *) enum cpp_ttype type; type = pragma_lex (&ctable_index); - if (type == CPP_NUMBER && tree_fits_uhwi_p (ctable_index)) + if (type == CPP_NUMBER && tree_fits_shwi_p (ctable_index)) { type = pragma_lex (&base_addr); - if (type == CPP_NUMBER && tree_fits_uhwi_p (base_addr)) + if (type == CPP_NUMBER && tree_fits_shwi_p 
(base_addr)) { - unsigned HOST_WIDE_INT i = tree_to_uhwi (ctable_index); - unsigned HOST_WIDE_INT base = tree_to_uhwi (base_addr); + HOST_WIDE_INT i = tree_to_shwi (ctable_index); + HOST_WIDE_INT base = sext_hwi (tree_to_shwi (base_addr), + POINTER_SIZE); type = pragma_lex (&base_addr); if (type != CPP_EOF) error ("junk at end of %<#pragma CTABLE_ENTRY%>"); - else if (i >= ARRAY_SIZE (pru_ctable)) + else if (!IN_RANGE (i, 0, ARRAY_SIZE (pru_ctable) - 1)) error ("%<CTABLE_ENTRY%> index %wd is not valid", i); else if (pru_ctable[i].valid && pru_ctable[i].base != base) error ("redefinition of %<CTABLE_ENTRY %wd%>", i); + else if (!IN_RANGE (base, INT32_MIN, INT32_MAX)) + error ("%<CTABLE_ENTRY%> base address does not fit in 32 bits"); else { if (base & 0xff) diff --git a/gcc/config/pru/pru-protos.h b/gcc/config/pru/pru-protos.h index c73fad8..4750f0e 100644 --- a/gcc/config/pru/pru-protos.h +++ b/gcc/config/pru/pru-protos.h @@ -23,7 +23,7 @@ struct pru_ctable_entry { bool valid; - unsigned HOST_WIDE_INT base; + HOST_WIDE_INT base; }; extern struct pru_ctable_entry pru_ctable[32]; @@ -66,9 +66,9 @@ pru_regno_ok_for_index_p (int regno, bool strict_p) return pru_regno_ok_for_base_p (regno, strict_p); } -extern int pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr); -extern int pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr); -extern int pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr); +extern int pru_get_ctable_exact_base_index (HOST_WIDE_INT caddr); +extern int pru_get_ctable_base_index (HOST_WIDE_INT caddr); +extern int pru_get_ctable_base_offset (HOST_WIDE_INT caddr); extern int pru_symref2ioregno (rtx op); diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc index 47e5f24..322e319 100644 --- a/gcc/config/pru/pru.cc +++ b/gcc/config/pru/pru.cc @@ -1428,7 +1428,7 @@ pru_valid_const_ubyte_offset (machine_mode mode, HOST_WIDE_INT offset) /* Recognize a CTABLE base address. Return CTABLE entry index, or -1 if base was not found in the pragma-filled pru_ctable. */ int -pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr) +pru_get_ctable_exact_base_index (HOST_WIDE_INT caddr) { unsigned int i; @@ -1444,7 +1444,7 @@ pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr) /* Check if the given address can be addressed via CTABLE_BASE + UBYTE_OFFS, and return the base CTABLE index if possible. */ int -pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr) +pru_get_ctable_base_index (HOST_WIDE_INT caddr) { unsigned int i; @@ -1461,7 +1461,7 @@ pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr) /* Return the offset from some CTABLE base for this address.
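(Illustrative, assuming the pragma spelling accepted by this port: after '#pragma ctable_entry 3 0x48040000', pru_ctable[3].base is 0x48040000, so caddr 0x48040010 resolves to base index 3 with offset 0x10.)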
*/ int -pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr) +pru_get_ctable_base_offset (HOST_WIDE_INT caddr) { int i; @@ -2004,7 +2004,7 @@ pru_print_operand_address (FILE *file, machine_mode mode, rtx op) case CONST_INT: { - unsigned HOST_WIDE_INT caddr = INTVAL (op); + HOST_WIDE_INT caddr = INTVAL (op); int base = pru_get_ctable_base_index (caddr); int offs = pru_get_ctable_base_offset (caddr); if (base < 0) diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize index fd55255..34dad45 100755 --- a/gcc/config/riscv/arch-canonicalize +++ b/gcc/config/riscv/arch-canonicalize @@ -32,7 +32,7 @@ import itertools from functools import reduce SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"] -CANONICAL_ORDER = "imafdgqlcbkjtpvn" +CANONICAL_ORDER = "imafdqlcbkjtpvnh" LONG_EXT_PREFIXES = ['z', 's', 'h', 'x'] # diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md index f372f0e..6531996 100644 --- a/gcc/config/riscv/autovec-opt.md +++ b/gcc/config/riscv/autovec-opt.md @@ -1714,6 +1714,74 @@ } [(set_attr "type" "vialu")]) +(define_insn_and_split "*<sat_op_v_vdup>_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (if_then_else:V_VLSI + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 5 "vector_length_operand") + (match_operand 6 "const_int_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM) + (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) + (unspec:V_VLSI + [(match_operand:V_VLSI 3 "register_operand") + (vec_duplicate:V_VLSI + (match_operand:<VEL> 4 "reg_or_int_operand"))] VSAT_VX_OP_V_VDUP) + (unspec:V_VLSI + [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + int vxrm_val = INTVAL (operands[9]); + riscv_vector::expand_vx_binary_vxrm_vec_vec_dup (operands[0], operands[3], + operands[4], + <VSAT_VX_OP_V_VDUP>, + vxrm_val, <MODE>mode); + + DONE; + } + [(set_attr "type" "vaalu")]) + +(define_insn_and_split "*<sat_op_vdup_v>_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (if_then_else:V_VLSI + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 5 "vector_length_operand") + (match_operand 6 "const_int_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM) + (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) + (unspec:V_VLSI + [(vec_duplicate:V_VLSI + (match_operand:<VEL> 4 "reg_or_int_operand")) + (match_operand:V_VLSI 3 "register_operand")] VSAT_VX_OP_VDUP_V) + (unspec:V_VLSI + [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + int vxrm_val = INTVAL (operands[9]); + riscv_vector::expand_vx_binary_vxrm_vec_dup_vec (operands[0], operands[3], + operands[4], + <VSAT_VX_OP_VDUP_V>, + vxrm_val, <MODE>mode); + + DONE; + } + [(set_attr "type" "vaalu")]) + ;; ============================================================================= ;; Combine vec_duplicate + op.vv to op.vf ;; Include @@ -1838,8 +1906,58 @@ emit_insn (gen_extend<vsubel><vel>2(tmp, operands[1])); rtx ops[] = {operands[0], tmp}; - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, ops); + riscv_vector::expand_broadcast (<MODE>mode, ops); + DONE; + } 
+ [(set_attr "type" "vfwmuladd")] ) + +;; vfwnmacc.vf +(define_insn_and_split "*vfwnmacc_vf_<mode>" + [(set (match_operand:VWEXTF 0 "register_operand") + (minus:VWEXTF + (mult:VWEXTF + (neg:VWEXTF + (vec_duplicate:VWEXTF + (float_extend:<VEL> + (match_operand:<VSUBEL> 2 "register_operand")))) + (float_extend:VWEXTF + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand"))) + (match_operand:VWEXTF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + rtx ops[] = {operands[0], operands[1], operands[2], operands[3]}; + riscv_vector::emit_vlmax_insn ( + code_for_pred_widen_mul_neg_scalar (MINUS, <MODE>mode), + riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops); + DONE; + } + [(set_attr "type" "vfwmuladd")] +) + +;; vfwnmsac.vf +(define_insn_and_split "*vfwnmsac_vf_<mode>" + [(set (match_operand:VWEXTF 0 "register_operand") + (minus:VWEXTF + (match_operand:VWEXTF 1 "register_operand") + (mult:VWEXTF + (float_extend:VWEXTF + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand")) + (vec_duplicate:VWEXTF + (float_extend:<VEL> + (match_operand:<VSUBEL> 2 "register_operand"))))))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + rtx ops[] = {operands[0], operands[1], operands[2], operands[3]}; + riscv_vector::emit_vlmax_insn ( + code_for_pred_widen_mul_neg_scalar (PLUS, <MODE>mode), + riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops); DONE; } [(set_attr "type" "vfwmuladd")] diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 2e86826..48de5ef 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -1359,9 +1359,7 @@ if (operands[2] == const0_rtx) { rtx ops[] = {operands[0], operands[0], operands[1]}; - riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::SCALAR_MOVE_MERGED_OP_TU, - ops, CONST1_RTX (Pmode)); + riscv_vector::expand_set_first_tu (<MODE>mode, ops); } else { @@ -1385,8 +1383,7 @@ VL we need for the slide. */ rtx tmp = gen_reg_rtx (<MODE>mode); rtx ops1[] = {tmp, operands[1]}; - emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, ops1, length); + riscv_vector::expand_broadcast (<MODE>mode, ops1, length); /* Slide exactly one element up leaving the tail elements unchanged.
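(Illustrative: a vslideup with offset 1 writes only elements [1, VL), so with VL capped at the insertion point, the elements at and beyond VL keep their previous contents.)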
*/ @@ -2489,7 +2486,8 @@ (sign_extend:VWEXTI (match_operand:<V_DOUBLE_TRUNC> 1 "register_operand")) (sign_extend:VWEXTI - (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand"))))))] + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand"))) + (const_int 1))))] "TARGET_VECTOR" { insn_code icode = code_for_pred (UNSPEC_VAADD, <V_DOUBLE_TRUNC>mode); @@ -2522,7 +2520,8 @@ (match_operand:<V_DOUBLE_TRUNC> 1 "register_operand")) (sign_extend:VWEXTI (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand"))) - (const_int 1)))))] + (const_int 1)) + (const_int 1))))] "TARGET_VECTOR" { insn_code icode = code_for_pred (UNSPEC_VAADD, <V_DOUBLE_TRUNC>mode); @@ -2532,6 +2531,19 @@ } ) +(define_expand "avg<mode>3_ceil" + [(match_operand:V_VLSI_D 0 "register_operand") + (match_operand:V_VLSI_D 1 "register_operand") + (match_operand:V_VLSI_D 2 "register_operand")] + "TARGET_VECTOR" + { + insn_code icode = code_for_pred (UNSPEC_VAADD, <MODE>mode); + riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP_VXRM_RNU, + operands); + DONE; + } +) + ;; csrwi vxrm, 2 ;; vaaddu.vv vd, vs2, vs1 (define_expand "uavg<mode>3_floor" diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc new file mode 100644 index 0000000..9681438 --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc @@ -0,0 +1,43 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Core Name}"); + puts (""); + puts ("@opindex mcpu"); + puts ("@item -mcpu=@var{processor-string}"); + puts ("Use architecture of and optimize the output for the given processor, specified"); + puts ("by particular CPU name. Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> coreNames; + +#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \ + coreNames.push_back (CORE_NAME); +#include "riscv-cores.def" +#undef RISCV_CORE + + for (size_t i = 0; i < coreNames.size(); ++i) { + if (i == coreNames.size() - 1) { + printf("@samp{%s}.\n", coreNames[i].c_str()); + } else { + printf("@samp{%s},\n\n", coreNames[i].c_str()); + } + } + + return 0; +} diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc new file mode 100644 index 0000000..1bdfe2a --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc @@ -0,0 +1,41 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Tune Name}"); + puts (""); + puts ("@opindex mtune"); + puts ("@item -mtune=@var{processor-string}"); + puts ("Optimize the output for the given processor, specified by microarchitecture or"); + puts ("particular CPU name. 
Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> tuneNames; + +#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \ + tuneNames.push_back (TUNE_NAME); +#include "riscv-cores.def" +#undef RISCV_TUNE + + for (size_t i = 0; i < tuneNames.size(); ++i) { + printf("@samp{%s},\n\n", tuneNames[i].c_str()); + } + + puts ("and all valid options for @option{-mcpu=}."); + + return 0; +} diff --git a/gcc/config/riscv/generic-vector-ooo.md b/gcc/config/riscv/generic-vector-ooo.md index ab9e57f..773003b 100644 --- a/gcc/config/riscv/generic-vector-ooo.md +++ b/gcc/config/riscv/generic-vector-ooo.md @@ -17,6 +17,9 @@ ;; <http://www.gnu.org/licenses/>. ;; Vector load/store +;; The insn reservations include "generic" as we won't have an in-order +;; generic definition for vector instructions. + (define_automaton "vector_ooo") ;; Separate issue queue for vector instructions. @@ -29,119 +32,141 @@ (define_cpu_unit "vxu_ooo_multicycle" "vector_ooo") (define_insn_reservation "vec_load" 6 - (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr")) "vxu_ooo_issue,vxu_ooo_alu") (define_insn_reservation "vec_store" 6 - (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector segment loads/stores. (define_insn_reservation "vec_loadstore_seg" 10 - (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ - vssegte,vssegts,vssegtux,vssegtox") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ + vssegte,vssegts,vssegtux,vssegtox")) "vxu_ooo_issue,vxu_ooo_alu") ;; Regular vector operations and integer comparisons. (define_insn_reservation "vec_alu" 3 - (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\ - vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\ - vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\ + vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\ + vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float comparison, conversion etc. (define_insn_reservation "vec_fcmp" 3 - (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\ - vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\ - vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\ + vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\ + vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector integer multiplication. (define_insn_reservation "vec_imul" 4 - (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\ - vghsh,vgmul") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\ + vghsh,vgmul")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float addition. (define_insn_reservation "vec_fadd" 4 - (eq_attr "type" "vfalu,vfwalu") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfalu,vfwalu")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float multiplication and FMA.
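;; (Illustrative consequence of this model: on generic_ooo, a dependent ;; consumer of a vfmacc.vv result observes the 6-cycle latency declared ;; below, while issue occupies vxu_ooo_issue for a single cycle.)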
(define_insn_reservation "vec_fmul" 6 - (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, assumed to be a generic operation for now. (define_insn_reservation "vec_crypto" 4 - (eq_attr "type" "crypto,vclz,vctz,vcpop") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "crypto,vclz,vctz,vcpop")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, AES (define_insn_reservation "vec_crypto_aes" 4 - (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, sha (define_insn_reservation "vec_crypto_sha" 4 - (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, SM3/4 (define_insn_reservation "vec_crypto_sm" 4 - (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector permute. (define_insn_reservation "vec_perm" 3 - (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\ - vislide1down,vfslide1up,vfslide1down,vgather,vcompress") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\ + vislide1down,vfslide1up,vfslide1down,vgather,vcompress")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector reduction. (define_insn_reservation "vec_reduction" 8 - (eq_attr "type" "vired,viwred,vfredu,vfwredu") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vired,viwred,vfredu,vfwredu")) "vxu_ooo_issue,vxu_ooo_multicycle") ;; Vector ordered reduction, assume the latency number is for ;; a 128-bit vector. It is scaled in riscv_sched_adjust_cost ;; for larger vectors. (define_insn_reservation "vec_ordered_reduction" 10 - (eq_attr "type" "vfredo,vfwredo") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfredo,vfwredo")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector integer division, assume not pipelined. (define_insn_reservation "vec_idiv" 16 - (eq_attr "type" "vidiv") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vidiv")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector float divisions and sqrt, assume not pipelined. (define_insn_reservation "vec_float_divsqrt" 16 - (eq_attr "type" "vfdiv,vfsqrt") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfdiv,vfsqrt")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector mask operations. (define_insn_reservation "vec_mask" 2 - (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\ - vfmovvf,vfmovfv") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\ + vfmovvf,vfmovfv")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector vsetvl. (define_insn_reservation "vec_vesetvl" 1 - (eq_attr "type" "vsetvl,vsetvl_pre") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsetvl,vsetvl_pre")) "vxu_ooo_issue") ;; Vector rounding mode setters, assume pipeline barrier. 
(define_insn_reservation "vec_setrm" 20 - (eq_attr "type" "wrvxrm,wrfrm") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "wrvxrm,wrfrm")) "vxu_ooo_issue,vxu_ooo_issue*3") ;; Vector read vlen/vlenb. (define_insn_reservation "vec_readlen" 4 - (eq_attr "type" "rdvlenb,rdvl") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "rdvlenb,rdvl")) "vxu_ooo_issue,vxu_ooo_issue") ;; Vector sf_vcp. (define_insn_reservation "vec_sf_vcp" 2 - (eq_attr "type" "sf_vc,sf_vc_se") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "sf_vc,sf_vc_se")) "vxu_ooo_issue") diff --git a/gcc/config/riscv/mips-p8700.md b/gcc/config/riscv/mips-p8700.md index ae0ea8d..fac9abb 100644 --- a/gcc/config/riscv/mips-p8700.md +++ b/gcc/config/riscv/mips-p8700.md @@ -163,5 +163,5 @@ vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll, vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz, vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16, - sf_vc,sf_vc_se")) + sf_vc,sf_vc_se,ghost")) "mips_p8700_dummies") diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 1f9a6b5..381f96c 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -518,6 +518,10 @@ (define_predicate "vector_broadcast_mask_operand" (ior (match_operand 0 "vector_least_significant_set_mask_operand") + (match_operand 0 "vector_all_trues_mask_operand"))) + +(define_predicate "strided_broadcast_mask_operand" + (ior (match_operand 0 "vector_least_significant_set_mask_operand") (ior (match_operand 0 "register_operand") (match_operand 0 "vector_all_trues_mask_operand")))) @@ -619,6 +623,15 @@ (define_predicate "direct_broadcast_operand" (match_test "riscv_vector::can_be_broadcast_p (op)")) +;; A strided broadcast is just a fallback pattern that loads from +;; memory. +(define_predicate "strided_broadcast_operand" + (match_test "riscv_vector::strided_broadcast_p (op)")) + +(define_predicate "any_broadcast_operand" + (ior (match_operand 0 "direct_broadcast_operand") + (match_operand 0 "strided_broadcast_operand"))) + ;; A CONST_INT operand that has exactly two bits cleared. 
(define_predicate "const_nottwobits_operand" (and (match_code "const_int") diff --git a/gcc/config/riscv/riscv-ext.def b/gcc/config/riscv/riscv-ext.def index 6fc6d38..09f18ad 100644 --- a/gcc/config/riscv/riscv-ext.def +++ b/gcc/config/riscv/riscv-ext.def @@ -80,8 +80,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{2, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 4, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -190,8 +190,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zba", "zbb", "zbs"}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 1, /* EXTRA_EXTENSION_FLAGS */ EXT_FLAG_MACRO) DEFINE_RISCV_EXT( @@ -216,8 +216,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 7, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -398,8 +398,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{2, 0}}), /* FLAG_GROUP */ zi, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 11, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -464,7 +464,7 @@ DEFINE_RISCV_EXT( /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zi, /* BITMASK_GROUP_ID */ 1, - /* BITMASK_BIT_POSITION*/ 1, + /* BITMASK_BIT_POSITION*/ 8, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -476,8 +476,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zm, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 12, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -787,8 +787,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zca"}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zc, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 10, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -813,8 +813,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zca", "zilsd"}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zc, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 9, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index a41c4c2..539321f 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -414,8 +414,14 @@ enum insn_flags : unsigned int /* Means INSN has VXRM operand and the value is VXRM_RNU. */ VXRM_RNU_P = 1 << 20, + /* Means INSN has VXRM operand and the value is VXRM_RNE. */ + VXRM_RNE_P = 1 << 21, + /* Means INSN has VXRM operand and the value is VXRM_RDN. */ - VXRM_RDN_P = 1 << 21, + VXRM_RDN_P = 1 << 22, + + /* Means INSN has VXRM operand and the value is VXRM_ROD. 
*/ + VXRM_ROD_P = 1 << 23, }; enum insn_type : unsigned int @@ -477,7 +483,9 @@ enum insn_type : unsigned int BINARY_OP_TUMA = __MASK_OP_TUMA | BINARY_OP_P, BINARY_OP_FRM_DYN = BINARY_OP | FRM_DYN_P, BINARY_OP_VXRM_RNU = BINARY_OP | VXRM_RNU_P, + BINARY_OP_VXRM_RNE = BINARY_OP | VXRM_RNE_P, BINARY_OP_VXRM_RDN = BINARY_OP | VXRM_RDN_P, + BINARY_OP_VXRM_ROD = BINARY_OP | VXRM_ROD_P, /* Ternary operator. Always have real merge operand. */ TERNARY_OP = HAS_DEST_P | HAS_MASK_P | USE_ALL_TRUES_MASK_P | HAS_MERGE_P @@ -672,6 +680,8 @@ void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode, machine_mode); void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode); void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode); +void expand_vx_binary_vxrm_vec_vec_dup (rtx, rtx, rtx, int, int, machine_mode); +void expand_vx_binary_vxrm_vec_dup_vec (rtx, rtx, rtx, int, int, machine_mode); #endif bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool, void (*)(rtx *, rtx), enum avl_type); @@ -695,6 +705,9 @@ bool expand_block_move (rtx, rtx, rtx, bool); machine_mode preferred_simd_mode (scalar_mode); machine_mode get_mask_mode (machine_mode); void expand_vec_series (rtx, rtx, rtx, rtx = 0); +void expand_broadcast (machine_mode, rtx *, rtx = 0); +void expand_set_first (machine_mode, rtx *, rtx = 0); +void expand_set_first_tu (machine_mode, rtx *, rtx = 0); void expand_vec_init (rtx, rtx); void expand_vec_perm (rtx, rtx, rtx, rtx); void expand_select_vl (rtx *); @@ -762,6 +775,7 @@ enum vlmul_type get_vlmul (rtx_insn *); int count_regno_occurrences (rtx_insn *, unsigned int); bool imm_avl_p (machine_mode); bool can_be_broadcast_p (rtx); +bool strided_broadcast_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 9080189..61c4a09 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -1625,16 +1625,14 @@ expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in) Otherwise, use a predicated store. 
*/ if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl))) { - emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP, - broadcast_ops); + riscv_vector::expand_broadcast (info.vmode, broadcast_ops); emit_move_insn (dst, fill_value); } else { if (!satisfies_constraint_vl (info.avl)) info.avl = force_reg (Pmode, info.avl); - emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode), - riscv_vector::UNARY_OP, broadcast_ops, info.avl); + riscv_vector::expand_broadcast (info.vmode, broadcast_ops, info.avl); machine_mode mask_mode = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode)) .require (); diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 242ac08..c9c8328 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -351,9 +351,12 @@ public: add_rounding_mode_operand (FRM_RNE); else if (m_insn_flags & VXRM_RNU_P) add_rounding_mode_operand (VXRM_RNU); + else if (m_insn_flags & VXRM_RNE_P) + add_rounding_mode_operand (VXRM_RNE); else if (m_insn_flags & VXRM_RDN_P) add_rounding_mode_operand (VXRM_RDN); - + else if (m_insn_flags & VXRM_ROD_P) + add_rounding_mode_operand (VXRM_ROD); if (insn_data[(int) icode].n_operands != m_opno) internal_error ("invalid number of operands for insn %s, " @@ -1190,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target, return false; } +/* Helper function to emit a vmv.v.x/vmv.v.i or one of their float variants. + If VL is not given a VLMAX insn will be emitted, otherwise + a non-VLMAX insn with length VL. + If the value to be broadcast is not suitable for vmv.v.x + fall back to a vlse with zero stride. This itself has a + fallback if the uarch prefers not to use a strided load + for broadcast. */
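Before the implementation, a hedged sketch in plain scalar C++ of the two strategies the comment above distinguishes (the container types and names are illustrative only, not the GCC API):

#include <cstdint>
#include <vector>

// vmv.v.x semantics: every active element receives the register value.
std::vector<uint64_t>
broadcast_reg (uint64_t x, size_t vl)
{
  return std::vector<uint64_t> (vl, x);
}

// vlse64.v vd, (addr), zero semantics: a zero-stride load reads the
// same memory location for each element, producing the same splat but
// sourced from memory.
std::vector<uint64_t>
broadcast_strided (const uint64_t *addr, size_t vl)
{
  std::vector<uint64_t> v (vl);
  for (size_t i = 0; i < vl; ++i)
    v[i] = *addr;  // stride 0: the address never advances
  return v;
}

The register form is used whenever the operand qualifies; the strided form is the memory fallback, with its own escape hatch for uarchs that dislike stride-0 loads.

+ +void +expand_broadcast (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[1]; + avl_type type = vl ?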
NONVLMAX : VLMAX; + if (can_be_broadcast_p (elt)) + emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops, + type, vl); + else + emit_avltype_insn (code_for_pred_strided_broadcast (mode), + UNARY_OP, ops, type, vl); +} + +/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */ + +void +expand_set_first (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[1]; + avl_type type = vl ? NONVLMAX : VLMAX; + if (can_be_broadcast_p (elt)) + emit_avltype_insn (code_for_pred_broadcast (mode), + SCALAR_MOVE_OP, ops, type, vl); + else + emit_avltype_insn (code_for_pred_strided_broadcast (mode), + SCALAR_MOVE_OP, ops, type, vl); +} + +/* Similar to expand_set_first but keeping the tail elements + unchanged (TU). */ + +void +expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[2]; + if (!vl) + vl = const1_rtx; + if (can_be_broadcast_p (elt)) + emit_nonvlmax_insn (code_for_pred_broadcast (mode), + SCALAR_MOVE_MERGED_OP_TU, ops, vl); + else + emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode), + SCALAR_MOVE_MERGED_OP_TU, ops, vl); +} + static void expand_const_vec_duplicate (rtx target, rtx src, rtx elt) { @@ -1226,7 +1282,7 @@ if (lra_in_progress) { rtx ops[] = {result, elt}; - emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops); + expand_broadcast (mode, ops); } else { @@ -1278,8 +1334,7 @@ { dup = gen_reg_rtx (builder->new_mode ()); rtx ops[] = {dup, ele}; - emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()), - UNARY_OP, ops); + expand_broadcast (builder->new_mode (), ops); } else dup = expand_vector_broadcast (builder->new_mode (), ele); @@ -1322,8 +1377,7 @@ rtx tmp1 = gen_reg_rtx (builder->mode ()); rtx dup_ops[] = {tmp1, builder->elt (0)}; - emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP, - dup_ops); + expand_broadcast (builder->mode (), dup_ops); for (unsigned int i = 1; i < builder->npatterns (); i++) { @@ -2136,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x) } } +/* This is a helper for binary ops with DImode scalar operands that are + broadcast (like vadd.vx v1, a1). + Instead of having similar code for all the expanders, this function + unifies the handling. For 64-bit targets all we do is choose + between the vi variant (if available) and the register variant. + For 32-bit targets we either create the sign-extending variant + of vop.vx (when the immediate fits 32 bits) or emit a vector + broadcast of the 64-bit register/immediate and switch to a + vop.vv (replacing the scalar op with the broadcast vector). */ + bool sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, machine_mode vector_mode, bool has_vi_variant_p, void (*emit_vector_func) (rtx *, rtx), enum avl_type type) { machine_mode scalar_mode = GET_MODE_INNER (vector_mode); + + /* If the scalar broadcast op fits an immediate, use the + vop.vi variant if there is one. */ if (has_vi_variant_p) { *scalar_op = force_reg (scalar_mode, *scalar_op); return false; } + /* On a 64-bit target we can always use the vop.vx variant. */ if (TARGET_64BIT) { if (!rtx_equal_p (*scalar_op, const0_rtx)) @@ -2155,6 +2223,8 @@ return false; } + /* For 32-bit targets, if there is no vop.vi variant for a 32-bit immediate + we need to use the sign-extending (SI -> DI) vop.vx variants. */ if (immediate_operand (*scalar_op, Pmode)) { if (!rtx_equal_p (*scalar_op, const0_rtx)) @@ -2164,40 +2234,29 @@ return false; } - bool avoid_strided_broadcast = false;
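Taken together, the comments sprinkled through sew64_scalar_helper describe a four-way choice. As a hedged summary, a sketch with hypothetical names (not the GCC API):

enum class ScalarStrategy
{
  VI,               // vop.vi: immediate fits the 5-bit form
  VX,               // vop.vx: 64-bit target, scalar in one register
  VX_SEXT,          // RV32: sign-extending SI -> DI vop.vx
  VV_VIA_BROADCAST  // RV32, full 64-bit value: broadcast, then vop.vv
};

ScalarStrategy
classify (bool has_vi_variant, bool target_64bit, bool fits_32bit_imm)
{
  if (has_vi_variant)
    return ScalarStrategy::VI;
  if (target_64bit)
    return ScalarStrategy::VX;
  if (fits_32bit_imm)
    return ScalarStrategy::VX_SEXT;
  return ScalarStrategy::VV_VIA_BROADCAST;
}

+ /* Now we're left with a 64-bit immediate or a register. + We cannot use a vop.vx variant but must broadcast the value first + and switch to a vop.vv variant.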
+ Broadcast can either be done via vlse64.v v1, reg, zero + or by loading one 64-bit element (vle64.v) and using a + broadcast vrgather.vi. This is decided when splitting + the strided broadcast insn. */ + gcc_assert (!TARGET_64BIT + && (CONST_INT_P (*scalar_op) + || register_operand (*scalar_op, scalar_mode))); + if (CONST_INT_P (*scalar_op)) { if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode))) - { - if (strided_load_broadcast_p ()) - *scalar_op = force_const_mem (scalar_mode, *scalar_op); - else - avoid_strided_broadcast = true; - } + *scalar_op = force_const_mem (scalar_mode, *scalar_op); else *scalar_op = force_reg (scalar_mode, *scalar_op); } rtx tmp = gen_reg_rtx (vector_mode); - if (!avoid_strided_broadcast) - { - rtx ops[] = {tmp, *scalar_op}; - emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops, - type, vl); - } - else - { - /* Load scalar as V1DI and broadcast via vrgather.vi. */ - rtx tmp1 = gen_reg_rtx (V1DImode); - emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op, - scalar_mode)); - tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode); - - rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)}; - emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode), - BINARY_OP, ops); - } - + rtx ops[] = {tmp, *scalar_op}; + emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode), + UNARY_OP, ops, type, vl); emit_vector_func (operands, tmp); return true; @@ -2591,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target, /* Step 1: Broadcast the first pattern. */ rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))}; - emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), - UNARY_OP, ops); + expand_broadcast (builder.mode (), ops); /* Step 2: Merge the rest iteration of pattern. */ for (unsigned int i = 1; i < builder.npatterns (); i++) { @@ -2605,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target, if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */ { rtx ops[] = {dup, merge_mask}; - emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)), - SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode)); + expand_set_first (GET_MODE (dup), ops); } else /* vmv.v.x. 
*/ { rtx ops[] = {dup, force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)}; rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()), Pmode); - emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP, - ops, vl); + expand_broadcast (mask_int_mode, ops, vl); } emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup)); @@ -4706,20 +4762,20 @@ rtx m1_tmp = gen_reg_rtx (m1_mode); rtx scalar_move_ops[] = {m1_tmp, init}; - insn_code icode = code_for_pred_broadcast (m1_mode); if (need_mask_operand_p (insn_flags)) { if (need_vl0_safe) - emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx); + expand_set_first (m1_mode, scalar_move_ops, const1_rtx); else - emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op); + expand_set_first (m1_mode, scalar_move_ops, vl_op); } else - emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops); + expand_set_first (m1_mode, scalar_move_ops); rtx m1_tmp2 = gen_reg_rtx (m1_mode); rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp}; + insn_code icode; if (need_vl0_safe) icode = code_for_pred (unspec_for_vl0_safe, vmode); else @@ -5597,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops); } +static enum insn_type +get_insn_type_by_vxrm_val (int vxrm_val) +{ + enum insn_type itype; + + switch (vxrm_val) + { + case VXRM_RNU: + itype = BINARY_OP_VXRM_RNU; + break; + case VXRM_RNE: + itype = BINARY_OP_VXRM_RNE; + break; + case VXRM_RDN: + itype = BINARY_OP_VXRM_RDN; + break; + case VXRM_ROD: + itype = BINARY_OP_VXRM_ROD; + break; + default: + gcc_unreachable (); + } + + return itype; +} + +/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)) + and its vxrm value. Aka the second op comes from the vec_duplicate, + and the first op is the vector reg. */ + +void +expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec, + int vxrm_val, machine_mode mode) +{ + enum insn_code icode; + enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val); + rtx ops[] = {op_0, op_1, op_2}; + + switch (unspec) + { + case UNSPEC_VAADD: + case UNSPEC_VAADDU: + icode = code_for_pred_scalar (unspec, mode); + break; + default: + gcc_unreachable (); + } + + emit_vlmax_insn (icode, itype, ops); +} + +/* Expand the binary vx combine with the format like v2 = vop(vec_dup(x), v1) + and its vxrm value. Aka the first op comes from the vec_duplicate, + and the second op is the vector reg. */ + +void +expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec, + int vxrm_val, machine_mode mode) +{ + enum insn_code icode; + enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val); + rtx ops[] = {op_0, op_1, op_2}; + + switch (unspec) + { + case UNSPEC_VAADD: + case UNSPEC_VAADDU: + icode = code_for_pred_scalar (unspec, mode); + break; + default: + gcc_unreachable (); + } + + emit_vlmax_insn (icode, itype, ops); +} + /* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)). Aka the second op comes from the vec_duplicate, and the first op is the vector reg. */ @@ -5808,25 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno) return count; }
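The vxrm expanders above only pick an insn_type; the arithmetic effect of the four rounding modes comes from the RVV spec's roundoff function. A hedged sketch of roundoff for the d = 1 case used by vaadd (plain C++, illustrative only; signed right shift is assumed arithmetic):

#include <cstdint>

enum Vxrm { RNU = 0, RNE = 1, RDN = 2, ROD = 3 };

// roundoff (v, 1): shift right by one and round the lost bit per vxrm.
int64_t
roundoff_1 (int64_t v, Vxrm mode)
{
  int64_t shifted = v >> 1;  // arithmetic shift assumed
  int64_t lost = v & 1;      // the single bit shifted out
  switch (mode)
    {
    case RNU: return shifted + lost;                  // round-to-nearest-up
    case RNE: return shifted + (lost & shifted & 1);  // ties to even
    case RDN: return shifted;                         // round down (truncate)
    case ROD: return shifted | (lost != 0);           // round to odd
    }
  return shifted;
}

// vaadd.vx element semantics are then roughly
// vd[i] = roundoff_1 (vs2[i] + rs1, vxrm).

-/* Return true if the OP can be directly broadcast. */ +/* Return true if the OP can be broadcast with a + v[f]mv.v.[xif] instruction.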
*/ + bool can_be_broadcast_p (rtx op) { machine_mode mode = GET_MODE (op); - /* We don't allow RA (register allocation) reload generate - (vec_duplicate:DI reg) in RV32 system wheras we allow - (vec_duplicate:DI mem) in RV32 system. */ - if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode) - && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode)) - && !satisfies_constraint_Wdm (op)) + + /* Zero always works and we can always put an immediate into a + register. + What's tricky is that for an immediate we don't know the + register's mode it will end up in, i.e. what element size + we want to broadcast. So even if the immediate is small it might + still end up in a DImode register that we cannot broadcast. + vmv.s.x, i.e. a single-element set, can handle this, though, + because it implicitly sign-extends to SEW. */ + if (rtx_equal_p (op, CONST0_RTX (mode)) + || const_int_operand (op, Xmode)) + return true; + + /* Do not accept DImode broadcasts on !TARGET_64BIT. Those + are handled by strided broadcast. */ + if (INTEGRAL_MODE_P (mode) + && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD)) + return false; + + /* Non-register operands that can be forced into a register we can + handle. These don't need to use strided broadcast. */ + if (INTEGRAL_MODE_P (mode) + && (memory_operand (op, mode) || CONST_POLY_INT_P (op)) + && can_create_pseudo_p ()) + return true; + + /* Likewise, do not accept HFmode broadcast if we don't have + vfmv.v.f for 16-bit registers available. */ + if (mode == HFmode && !TARGET_ZVFH) + return false; + + /* Same for float, just that we can always handle 64-bit doubles + even on !TARGET_64BIT. We have ruled out 16-bit HF already + above. */ + if (FLOAT_MODE_P (mode) + && (memory_operand (op, mode) || CONSTANT_P (op)) + && can_create_pseudo_p ()) + return true; + + /* After excluding all the cases we cannot handle, the register types + that remain can always be broadcast. */ + if (register_operand (op, mode)) + return true; + + return false; +} + +/* Returns true for all operands that cannot use vmv.v.x, vfmv.v.f, + vmv.s.x, or vfmv.s.f but rather need to go via memory. */ + +bool +strided_broadcast_p (rtx op) +{ + machine_mode mode = GET_MODE (op); + if (!memory_operand (op, mode) + && !register_operand (op, mode) + && !rtx_equal_p (op, CONST0_RTX (mode)) + && !const_int_operand (op, mode)) return false; - if (satisfies_constraint_K (op) || register_operand (op, mode) - || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op)) - || rtx_equal_p (op, CONST0_RTX (mode))) + /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit + DImode elements. */ + if (INTEGRAL_MODE_P (mode) + && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD)) + return true; + + /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements. */ + if (!TARGET_ZVFH && mode == HFmode) return true; - return can_create_pseudo_p () && nonmemory_operand (op, mode); + + return false; } void @@ -5941,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) return false; }
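The VL = 1 equivalence that splat_to_scalar_move_p (below) tests can be illustrated with plain scalar code (hedged sketch; the vector types are stand-ins and vd is assumed large enough):

#include <cstdint>
#include <vector>

// vmv.v.x under a given VL: all active elements are written.
void
splat (std::vector<int64_t> &vd, int64_t x, size_t vl)
{
  for (size_t i = 0; i < vl; ++i)
    vd[i] = x;
}

// vmv.s.x: exactly element 0 is written.
void
set_first (std::vector<int64_t> &vd, int64_t x)
{
  vd[0] = x;
}

// With vl == 1 both functions touch exactly element 0 (tail elements
// follow the tail policy either way), so the cheaper scalar move can
// replace the full splat.

-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */ +/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. + That's the case if we're dealing with a scalar broadcast that + has VL = 1.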
*/ + bool splat_to_scalar_move_p (rtx *ops) { diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index bf5172c..7e4d396 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -643,7 +643,8 @@ public: return e.use_exact_insn (code_for_pred_mov (e.vector_mode ())); case OP_TYPE_x: case OP_TYPE_f: - return e.use_exact_insn (code_for_pred_broadcast (e.vector_mode ())); + return e.use_scalar_broadcast_insn + (code_for_pred_broadcast (e.vector_mode ())); default: gcc_unreachable (); } diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc index 8810af0..0db7549 100644 --- a/gcc/config/riscv/riscv-vector-builtins.cc +++ b/gcc/config/riscv/riscv-vector-builtins.cc @@ -4753,7 +4753,10 @@ function_expander::use_ternop_insn (bool vd_accum_p, insn_code icode) } /* Implement the call using instruction ICODE, with a 1:1 mapping between - arguments and input operands. */ + arguments and input operands. + There are operands that cannot be broadcast using v[f]mv. In that case + we switch to a strided broadcast. */ + rtx function_expander::use_widen_ternop_insn (insn_code icode) { @@ -4794,7 +4797,10 @@ function_expander::use_widen_ternop_insn (insn_code icode) } /* Implement the call using instruction ICODE, with a 1:1 mapping between - arguments and input operands. */ + arguments and input operands. + There are operands that cannot be broadcast using v[f]mv. In that case + we switch to a strided broadcast. */ + rtx function_expander::use_scalar_move_insn (insn_code icode) { @@ -4812,6 +4818,37 @@ function_expander::use_scalar_move_insn (insn_code icode) for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) add_input_operand (argno); + if (!can_be_broadcast_p (m_ops[3].value)) + icode = code_for_pred_strided_broadcast (vector_mode ()); + + add_input_operand (Pmode, get_tail_policy_for_pred (pred)); + add_input_operand (Pmode, get_mask_policy_for_pred (pred)); + add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, with a 1:1 mapping between + arguments and input operands. */ +rtx +function_expander::use_scalar_broadcast_insn (insn_code icode) +{ + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); + + /* Record the offset to get the argument. */ + int arg_offset = 0; + add_all_one_mask_operand (mask_mode ()); + + if (use_real_merge_p (pred)) + add_input_operand (arg_offset++); + else + add_vundef_operand (mode); + + for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) + add_input_operand (argno); + + if (!can_be_broadcast_p (m_ops[3].value)) + icode = code_for_pred_strided_broadcast (vector_mode ()); + add_input_operand (Pmode, get_tail_policy_for_pred (pred)); add_input_operand (Pmode, get_mask_policy_for_pred (pred)); add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h index 1f2587a..86d8115 100644 --- a/gcc/config/riscv/riscv-vector-builtins.h +++ b/gcc/config/riscv/riscv-vector-builtins.h @@ -497,6 +497,7 @@ public: rtx use_ternop_insn (bool, insn_code); rtx use_widen_ternop_insn (insn_code); rtx use_scalar_move_insn (insn_code); + rtx use_scalar_broadcast_insn (insn_code); rtx generate_insn (insn_code); /* The function call expression. 
*/ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 4d8170d..44ef44a 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -178,8 +178,8 @@ get_live_range (hash_map<tree, pair> *live_ranges, tree arg) STMT 5 (be vectorized) -- point 2 ... */ -static void -compute_local_program_points ( +void +costs::compute_local_program_points ( vec_info *vinfo, hash_map<basic_block, vec<stmt_point>> &program_points_per_bb) { @@ -274,14 +274,14 @@ loop_invariant_op_p (class loop *loop, /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, - bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, + slp_tree node, tree var, bool lhs_p) { if (!var) return false; gimple *stmt = STMT_VINFO_STMT (stmt_info); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + stmt_info = vect_stmt_to_vectorize (stmt_info); + enum stmt_vec_info_type type = SLP_TREE_TYPE (node); if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) { if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE @@ -357,8 +357,8 @@ variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, The live range of SSA 1 is [1, 3] in bb 2. The live range of SSA 2 is [0, 4] in bb 3. */ -static machine_mode -compute_local_live_ranges ( +machine_mode +costs::compute_local_live_ranges ( loop_vec_info loop_vinfo, const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb) @@ -388,8 +388,11 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (loop, program_point.stmt_info, lhs, - true)) + slp_tree *node = vinfo_slp_map.get (program_point.stmt_info); + if (!node) + continue; + if (variable_vectorized_p (loop, program_point.stmt_info, + *node, lhs, true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -397,7 +400,7 @@ compute_local_live_ranges ( pair &live_range = live_ranges->get_or_insert (lhs, &existed_p); gcc_assert (!existed_p); - if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info) + if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) == VMAT_LOAD_STORE_LANES) point = get_first_lane_point (program_points, program_point.stmt_info); @@ -406,8 +409,8 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - if (variable_vectorized_p (loop, program_point.stmt_info, var, - false)) + if (variable_vectorized_p (loop, program_point.stmt_info, + *node, var, false)) { biggest_mode = get_biggest_mode (biggest_mode, @@ -415,8 +418,7 @@ compute_local_live_ranges ( bool existed_p = false; pair &live_range = live_ranges->get_or_insert (var, &existed_p); - if (STMT_VINFO_MEMORY_ACCESS_TYPE ( - program_point.stmt_info) + if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) == VMAT_LOAD_STORE_LANES) point = get_last_lane_point (program_points, program_point.stmt_info); @@ -597,15 +599,15 @@ get_store_value (gimple *stmt) } /* Return true if additional vector vars needed. 
*/ -static bool -need_additional_vector_vars_p (stmt_vec_info stmt_info) +bool +costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, + slp_tree node) { - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + enum stmt_vec_info_type type = SLP_TREE_TYPE (node); if (type == load_vec_info_type || type == store_vec_info_type) { if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) return true; machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); @@ -657,8 +659,8 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode) Then, after this function, we update SSA 1 live range in bb 2 into [2, 4] since SSA 1 is live out into bb 3. */ -static void -update_local_live_ranges ( +void +costs::update_local_live_ranges ( vec_info *vinfo, hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb, @@ -685,8 +687,13 @@ update_local_live_ranges ( { gphi *phi = psi.phi (); stmt_vec_info stmt_info = vinfo->lookup_stmt (phi); - if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) - == undef_vec_info_type) + stmt_info = vect_stmt_to_vectorize (stmt_info); + slp_tree *node = vinfo_slp_map.get (stmt_info); + + if (!node) + continue; + + if (SLP_TREE_TYPE (*node) == undef_vec_info_type) continue; for (j = 0; j < gimple_phi_num_args (phi); j++) @@ -761,9 +768,12 @@ update_local_live_ranges ( if (!is_gimple_assign_or_call (gsi_stmt (si))) continue; stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); - if (need_additional_vector_vars_p (stmt_info)) + stmt_info = vect_stmt_to_vectorize (stmt_info); + slp_tree *node = vinfo_slp_map.get (stmt_info); + if (!node) + continue; + enum stmt_vec_info_type type = SLP_TREE_TYPE (*node); + if (need_additional_vector_vars_p (stmt_info, *node)) { /* For non-adjacent load/store STMT, we will potentially convert it into: @@ -816,8 +826,8 @@ update_local_live_ranges ( } /* Compute the maximum live V_REGS. */ -static bool -has_unexpected_spills_p (loop_vec_info loop_vinfo) +bool +costs::has_unexpected_spills_p (loop_vec_info loop_vinfo) { /* Compute local program points. It's a fast and effective computation. */ @@ -899,7 +909,11 @@ costs::analyze_loop_vinfo (loop_vec_info loop_vinfo) /* Detect whether we're vectorizing for VLA and should apply the unrolling heuristic described above m_unrolled_vls_niters. */ record_potential_vls_unrolling (loop_vinfo); +} +void +costs::record_lmul_spills (loop_vec_info loop_vinfo) +{ /* Detect whether the LOOP has unexpected spills. */ record_potential_unexpected_spills (loop_vinfo); } @@ -1071,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const load/store. 
*/ static int segment_loadstore_group_size (enum vect_cost_for_stmt kind, - stmt_vec_info stmt_info) + stmt_vec_info stmt_info, slp_tree node) { if (stmt_info && (kind == vector_load || kind == vector_store) @@ -1079,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind, { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); if (stmt_info - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -1093,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind, unsigned costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, stmt_vec_info stmt_info, - slp_tree, tree vectype, int stmt_cost) + slp_tree node, tree vectype, int stmt_cost) { const cpu_vector_cost *costs = get_vector_costs (); switch (kind) @@ -1116,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, each vector in the group. Here we additionally add permute costs for each. */ /* TODO: Indexed and ordered/unordered cost. */ - int group_size = segment_loadstore_group_size (kind, stmt_info); + int group_size = segment_loadstore_group_size (kind, stmt_info, + node); if (group_size > 1) { switch (group_size) @@ -1239,8 +1254,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, int stmt_cost = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign); + if (stmt_info && node) + vinfo_slp_map.put (stmt_info, node); + /* Do one-time initialization based on the vinfo. */ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo); + if (!m_analyzed_vinfo) { if (loop_vinfo) @@ -1326,6 +1345,8 @@ costs::finish_cost (const vector_costs *scalar_costs) { if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo)) { + record_lmul_spills (loop_vinfo); + adjust_vect_cost_per_loop (loop_vinfo); } vector_costs::finish_cost (scalar_costs); diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index de546a6..b84ceb1 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -91,7 +91,10 @@ private: typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; hash_set <tree_pair_hash> memrefs; + hash_map <stmt_vec_info, slp_tree> vinfo_slp_map; + void analyze_loop_vinfo (loop_vec_info); + void record_lmul_spills (loop_vec_info loop_vinfo); void record_potential_vls_unrolling (loop_vec_info); bool prefer_unrolled_loop () const; @@ -103,6 +106,19 @@ private: bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + void compute_local_program_points (vec_info *, + hash_map<basic_block, vec<stmt_point>> &); + void update_local_live_ranges (vec_info *, + hash_map<basic_block, vec<stmt_point>> &, + hash_map<basic_block, hash_map<tree, pair>> &, + machine_mode *); + machine_mode compute_local_live_ranges + (loop_vec_info, const hash_map<basic_block, vec<stmt_point>> &, + hash_map<basic_block, hash_map<tree, pair>> &); + + bool has_unexpected_spills_p (loop_vec_info); + bool need_additional_vector_vars_p (stmt_vec_info, slp_tree); + void adjust_vect_cost_per_loop (loop_vec_info); unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info, diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 1275b03..0a9fcef 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -170,7 +170,7 @@ struct GTY(()) riscv_frame_info { }; enum riscv_privilege_levels { 
- UNKNOWN_MODE, USER_MODE, SUPERVISOR_MODE, MACHINE_MODE + UNKNOWN_MODE, SUPERVISOR_MODE, MACHINE_MODE, RNMI_MODE }; struct GTY(()) mode_switching_info { @@ -3967,13 +3967,27 @@ get_vector_binary_rtx_cost (rtx x, int scalar2vr_cost) { gcc_assert (riscv_v_ext_mode_p (GET_MODE (x))); - rtx op_0 = XEXP (x, 0); - rtx op_1 = XEXP (x, 1); + rtx neg; + rtx op_0; + rtx op_1; + + if (GET_CODE (x) == UNSPEC) + { + op_0 = XVECEXP (x, 0, 0); + op_1 = XVECEXP (x, 0, 1); + } + else + { + op_0 = XEXP (x, 0); + op_1 = XEXP (x, 1); + } if (GET_CODE (op_0) == VEC_DUPLICATE || GET_CODE (op_1) == VEC_DUPLICATE) return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); - else if (GET_CODE (op_0) == NEG && GET_CODE (op_1) == VEC_DUPLICATE) + else if (GET_CODE (neg = op_0) == NEG + && (GET_CODE (op_1) == VEC_DUPLICATE + || GET_CODE (XEXP (neg, 0)) == VEC_DUPLICATE)) return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); else return COSTS_N_INSNS (1); @@ -4021,6 +4035,21 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case SS_MINUS: *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); break; + case UNSPEC: + { + switch (XINT (op, 1)) + { + case UNSPEC_VAADDU: + case UNSPEC_VAADD: + *total + = get_vector_binary_rtx_cost (op, scalar2vr_cost); + break; + default: + *total = COSTS_N_INSNS (1); + break; + } + } + break; default: *total = COSTS_N_INSNS (1); break; @@ -6896,12 +6925,18 @@ riscv_handle_type_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args, } string = TREE_STRING_POINTER (cst); - if (strcmp (string, "user") && strcmp (string, "supervisor") - && strcmp (string, "machine")) + if (!strcmp (string, "rnmi") && !TARGET_SMRNMI) + { + error ("attribute 'rnmi' requires the Smrnmi ISA extension"); + *no_add_attrs = true; + } + else if (strcmp (string, "supervisor") + && strcmp (string, "machine") + && strcmp (string, "rnmi")) { warning (OPT_Wattributes, - "argument to %qE attribute is not %<\"user\"%>, %<\"supervisor\"%>, " - "or %<\"machine\"%>", name); + "argument to %qE attribute is not %<\"supervisor\"%>, " + "%<\"machine\"%>, or %<\"rnmi\"%>", name); *no_add_attrs = true; } } @@ -9049,7 +9084,7 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size) /* We want the CFA independent of the stack pointer for the duration of the loop. */ add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, temp1, + plus_constant (Pmode, temp2, initial_cfa_offset + rounded_size)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9682,12 +9717,12 @@ riscv_expand_epilogue (int style) if (th_int_mask && TH_INT_INTERRUPT (cfun)) emit_jump_insn (gen_th_int_pop ()); - else if (mode == MACHINE_MODE) - emit_jump_insn (gen_riscv_mret ()); else if (mode == SUPERVISOR_MODE) emit_jump_insn (gen_riscv_sret ()); - else - emit_jump_insn (gen_riscv_uret ()); + else if (mode == RNMI_MODE) + emit_jump_insn (gen_riscv_mnret ()); + else /* Must be MACHINE_MODE. */ + emit_jump_insn (gen_riscv_mret ()); } else if (style != SIBCALL_RETURN) { @@ -10359,10 +10394,10 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); bool sched1 = can_create_pseudo_p (); - unsigned int prev_dest_regno = (REG_P (SET_DEST (prev_set)) + unsigned int prev_dest_regno = (prev_set && REG_P (SET_DEST (prev_set)) ? REGNO (SET_DEST (prev_set)) : FIRST_PSEUDO_REGISTER); - unsigned int curr_dest_regno = (REG_P (SET_DEST (curr_set)) + unsigned int curr_dest_regno = (curr_set && REG_P (SET_DEST (curr_set)) ? 
REGNO (SET_DEST (curr_set)) : FIRST_PSEUDO_REGISTER); @@ -12029,10 +12064,10 @@ riscv_get_interrupt_type (tree decl) { const char *string = TREE_STRING_POINTER (TREE_VALUE (attr_args)); - if (!strcmp (string, "user")) - return USER_MODE; - else if (!strcmp (string, "supervisor")) + if (!strcmp (string, "supervisor")) return SUPERVISOR_MODE; + else if (!strcmp (string, "rnmi")) + return RNMI_MODE; else /* Must be "machine". */ return MACHINE_MODE; } @@ -12649,14 +12684,31 @@ riscv_estimated_poly_value (poly_int64 val, /* Return true if the vector misalignment factor is supported by the target. */ bool -riscv_support_vector_misalignment (machine_mode mode, - const_tree type ATTRIBUTE_UNUSED, - int misalignment, - bool is_packed ATTRIBUTE_UNUSED) +riscv_support_vector_misalignment (machine_mode mode, const_tree type, + int misalignment, bool is_packed, + bool is_gather_scatter) { - /* Depend on movmisalign pattern. */ + /* IS_PACKED is true if the corresponding scalar element is not naturally + aligned. If the misalignment is unknown and the access is packed, + we defer to the default hook which will check if movmisalign is present. + Movmisalign, in turn, depends on TARGET_VECTOR_MISALIGN_SUPPORTED. */ + if (misalignment == DR_MISALIGNMENT_UNKNOWN) + { + if (!is_packed) + return true; + } + else + { + /* If we know that misalignment is a multiple of the element size, we're + good. */ + if (misalignment % TYPE_ALIGN_UNIT (type) == 0) + return true; + } + + /* Otherwise fall back to movmisalign again. */ return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
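As a hedged summary of the new hook logic above (illustrative signature, not the GCC one; the real hook falls back to the generic movmisalign check in the remaining cases rather than returning false outright):

bool
supports_misalignment (bool known, int misalignment_bytes,
                       int elem_align_bytes, bool is_packed)
{
  if (!known)
    return !is_packed;  // unknown misalignment: OK unless packed
  // Known misalignment: OK when it is element-aligned.
  return misalignment_bytes % elem_align_bytes == 0;
}

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index c3b504d..578dd43 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -120,7 +120,7 @@ ;; Interrupt handler instructions. UNSPECV_MRET UNSPECV_SRET - UNSPECV_URET + UNSPECV_MNRET ;; Blockage and synchronization.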
UNSPECV_BLOCKAGE @@ -4166,11 +4166,11 @@ "sret" [(set_attr "type" "ret")]) -(define_insn "riscv_uret" +(define_insn "riscv_mnret" [(return) - (unspec_volatile [(const_int 0)] UNSPECV_URET)] - "" - "uret" + (unspec_volatile [(const_int 0)] UNSPECV_MNRET)] + "TARGET_SMRNMI" + "mnret" [(set_attr "type" "ret")]) (define_insn "stack_tie<mode>" diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 7aac56a..a7eaa8b 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext) $(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi $(STAMP) s-riscv-ext.texi -# Run `riscv-regen' after you changed or added anything from riscv-ext*.def +RISCV_CORES_DEFS = \ + $(srcdir)/config/riscv/riscv-cores.def + +build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \ + $(RISCV_CORES_DEFS) + $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@ + +build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \ + $(RISCV_CORES_DEFS) + $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@ + +build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o + $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $< + +build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o + $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $< + +$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS) +$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true + +$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS) +$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true + +s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext) + $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi + $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi + $(STAMP) s-riscv-mtune.texi + +s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext) + $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi + $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi + $(STAMP) s-riscv-mcpu.texi + +# Run `riscv-regen' after you changed or added anything from riscv-ext*.def and riscv-cores*.def .PHONY: riscv-regen -riscv-regen: s-riscv-ext.texi s-riscv-ext.opt +riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md index 5f6cc42..aa3b6fb 100644 --- a/gcc/config/riscv/vector-iterators.md +++ b/gcc/config/riscv/vector-iterators.md @@ -4013,6 +4013,14 @@ UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL UNSPEC_VSSRL UNSPEC_VSSRA]) +(define_int_iterator VSAT_VX_OP_V_VDUP [ + UNSPEC_VAADDU UNSPEC_VAADD +]) + +(define_int_iterator VSAT_VX_OP_VDUP_V [ + UNSPEC_VAADDU UNSPEC_VAADD +]) + (define_int_iterator VSAT_ARITH_OP [UNSPEC_VAADDU UNSPEC_VAADD UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL]) (define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL UNSPEC_VSSRA]) @@ -4047,6 +4055,14 @@ (UNSPEC_VSSRA "vsshift") (UNSPEC_VNCLIP "vnclip") (UNSPEC_VNCLIPU "vnclip")]) +(define_int_attr sat_op_v_vdup [ + (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd") +]) + +(define_int_attr sat_op_vdup_v [ + (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd") +]) + (define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") (UNSPEC_VMSOF "sof") (UNSPEC_VFRSQRT7 "rsqrt7")]) diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index baf215b..66b7670 100644 --- 
a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1551,20 +1551,44 @@ (define_expand "vec_duplicate<mode>" [(set (match_operand:V_VLS 0 "register_operand") (vec_duplicate:V_VLS - (match_operand:<VEL> 1 "direct_broadcast_operand")))] + (match_operand:<VEL> 1 "any_broadcast_operand")))] "TARGET_VECTOR" { - /* Early expand DImode broadcast in RV32 system to avoid RA reload - generate (set (reg) (vec_duplicate:DI)). */ + /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form. + Otherwise combine or late combine could end up doing + "64-bit broadcast" (!= vmv.v.x) + + vadd.vv + = vadd.vx + which would be invalid. */ bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode)); if (!FLOAT_MODE_P (<VEL>mode) && gt_p) { - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, operands); - DONE; + riscv_vector::emit_vlmax_insn + (code_for_pred_strided_broadcast + (<MODE>mode), riscv_vector::UNARY_OP, operands); + DONE; } - /* Otherwise, allow it fall into general vec_duplicate pattern - which allow us to have vv->vx combine optimization in later pass. */ + + /* Even though we can eventually broadcast any permissible + constant by moving it into a register we need to force + any non-immediate one into a register here. + If we didn't do that we couldn't fwprop/late-combine + vec_duplicate 123.45f + + vfadd.vv + = vfadd.vf + because the constant is valid for vec_duplicate but not + for vfadd.vf. Therefore we need to do + fa0 = 123.45f + vec_duplicate fa0 + + vfadd.vv + = vfadd.vf */ + if (!satisfies_constraint_P (operands[1]) + && !satisfies_constraint_J (operands[1]) + && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode)) + && !memory_operand (operands[1], <VEL>mode)) + operands[1] = force_reg (<VEL>mode, operands[1]); + + /* Otherwise keep the vec_duplicate pattern until split. */ }) ;; According to GCC internal: @@ -1574,28 +1598,20 @@ (define_insn_and_split "*vec_duplicate<mode>" [(set (match_operand:V_VLS 0 "register_operand") (vec_duplicate:V_VLS - (match_operand:<VEL> 1 "direct_broadcast_operand")))] + (match_operand:<VEL> 1 "any_broadcast_operand")))] "TARGET_VECTOR && can_create_pseudo_p ()" "#" "&& 1" [(const_int 0)] { - if (!strided_load_broadcast_p () - && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode) - { - /* For Float16, reinterpret as HImode, broadcast and reinterpret - back. */ - poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode); - machine_mode vmodehi - = riscv_vector::get_vector_mode (HImode, nunits).require (); - rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode), - lowpart_subreg (HImode, operands[1], HFmode)}; - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi), - riscv_vector::UNARY_OP, ops); - } - else + if (riscv_vector::can_be_broadcast_p (operands[1])) riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), riscv_vector::UNARY_OP, operands); + else + riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast + (<MODE>mode), riscv_vector::UNARY_OP, + operands); + DONE; } [(set_attr "type" "vector")] @@ -2141,69 +2157,45 @@ (match_operand:V_VLS 2 "vector_merge_operand")))] "TARGET_VECTOR" { - /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f - has better chances to do vsetvl fusion in vsetvl pass. 
*/ bool wrap_vec_dup = true; rtx vec_cst = NULL_RTX; - if (riscv_vector::splat_to_scalar_move_p (operands)) - { - operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); - operands[3] = force_reg (<VEL>mode, operands[3]); - } - else if (immediate_operand (operands[3], <VEL>mode) - && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3])) - && (/* -> pred_broadcast<mode>_zero */ - (vector_least_significant_set_mask_operand (operands[1], - <VM>mode) - && vector_const_0_operand (vec_cst, <MODE>mode)) - || (/* pred_broadcast<mode>_imm */ - vector_all_trues_mask_operand (operands[1], <VM>mode) - && vector_const_int_or_double_0_operand (vec_cst, - <MODE>mode)))) + if (immediate_operand (operands[3], <VEL>mode) + && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3])) + && (/* -> pred_broadcast<mode>_zero */ + (vector_least_significant_set_mask_operand (operands[1], + <VM>mode) + && vector_const_0_operand (vec_cst, <MODE>mode)) + || (/* pred_broadcast<mode>_imm */ + vector_all_trues_mask_operand (operands[1], <VM>mode) + && vector_const_int_or_double_0_operand (vec_cst, + <MODE>mode)))) { operands[3] = vec_cst; wrap_vec_dup = false; } - /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */ - else if (satisfies_constraint_Wdm (operands[3])) - { - if (satisfies_constraint_Wb1 (operands[1])) - { - /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */ - if (satisfies_constraint_vu (operands[2])) - operands[1] = CONSTM1_RTX (<VM>mode); - else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)) - { - /* Case 2: vmv.s.x (TU, x == memory) ==> - vl = 0 or 1; + vlse.v (TU) in RV32 system */ - operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); - operands[1] = CONSTM1_RTX (<VM>mode); - } - else - /* Case 3: load x (memory) to register. */ - operands[3] = force_reg (<VEL>mode, operands[3]); - } - } - else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode) - && (immediate_operand (operands[3], Pmode) + else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD + && satisfies_constraint_Wb1 (operands[1]) + && (immediate_operand (operands[3], Xmode) || (CONST_POLY_INT_P (operands[3]) && known_ge (rtx_to_poly_int64 (operands[3]), 0U) - && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode))))) + && known_le (rtx_to_poly_int64 (operands[3]), + GET_MODE_SIZE (<MODE>mode))))) { rtx tmp = gen_reg_rtx (Pmode); poly_int64 value = rtx_to_poly_int64 (operands[3]); - emit_move_insn (tmp, gen_int_mode (value, Pmode)); + emit_move_insn (tmp, gen_int_mode (value, Xmode)); operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp); } - /* Never load (const_int 0) into a register, that's silly. */ - else if (operands[3] == CONST0_RTX (<VEL>mode)) + + /* For a vmv.v.x never load (const_int 0) or valid immediate operands + into a register, because we can use vmv.v.i. */ + else if (satisfies_constraint_Wc1 (operands[1]) + && (satisfies_constraint_P (operands[3]) + || operands[3] == CONST0_RTX (<VEL>mode))) ; - /* If we're broadcasting [-16..15] across more than just - element 0, then we can use vmv.v.i directly, thus avoiding - the load of the constant into a GPR. */ - else if (CONST_INT_P (operands[3]) - && IN_RANGE (INTVAL (operands[3]), -16, 15) - && !satisfies_constraint_Wb1 (operands[1])) + /* For vmv.s.x we have vmv.s.x v1, zero. 
*/ + else if (satisfies_constraint_Wb1 (operands[1]) + && operands[3] == CONST0_RTX (<VEL>mode)) ; else operands[3] = force_reg (<VEL>mode, operands[3]); @@ -2211,131 +2203,68 @@ operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]); }) -(define_insn_and_split "*pred_broadcast<mode>" - [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vd, vd, vr, vr, vr, vr") +(define_insn_and_rewrite "*pred_broadcast<mode>" + [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSI (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1") - (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl") - (match_operand 5 "const_int_operand" " i, i, i, i, i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i") + [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (vec_duplicate:V_VLSI - (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ")) - (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))] + (match_operand:<VEL> 3 "direct_broadcast_operand" " rP, rP, rJ, rJ")) + (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "@ vmv.v.%o3\t%0,%3 vmv.v.%o3\t%0,%3 - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero - vlse<sew>.v\t%0,%3,zero vmv.s.x\t%0,%z3 vmv.s.x\t%0,%z3" - "(register_operand (operands[3], <VEL>mode) - || CONST_POLY_INT_P (operands[3])) - && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)" - [(const_int 0)] - { - gcc_assert (can_create_pseudo_p ()); - if (CONST_POLY_INT_P (operands[3])) - { - rtx tmp = gen_reg_rtx (<VEL>mode); - emit_move_insn (tmp, operands[3]); - operands[3] = tmp; - } - - /* For SEW = 64 in RV32 system, we expand vmv.s.x: - andi a2,a2,1 - vsetvl zero,a2,e64 - vlse64.v */ - if (satisfies_constraint_Wb1 (operands[1])) - { - operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); - operands[1] = CONSTM1_RTX (<VM>mode); - } - - /* If the target doesn't want a strided-load broadcast we go with a regular - V1DImode load and a broadcast gather. 
*/ - if (strided_load_broadcast_p ()) - { - rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode), - GET_MODE_ALIGNMENT (<VEL>mode)); - mem = validize_mem (mem); - emit_move_insn (mem, operands[3]); - mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0))); - - emit_insn - (gen_pred_broadcast<mode> - (operands[0], operands[1], operands[2], mem, - operands[4], operands[5], operands[6], operands[7])); - } - else - { - rtx tmp = gen_reg_rtx (V1DImode); - emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3], - <VEL>mode)); - tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode); - - emit_insn - (gen_pred_gather<mode>_scalar - (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode), - operands[4], operands[5], operands[6], operands[7])); - } - DONE; - } - [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv") + "&& (operands[1] == CONSTM1_RTX (<VM>mode) + && operands[4] == CONST1_RTX (Pmode) + && (register_operand (operands[3], <VEL>mode) + || satisfies_constraint_J (operands[3])))" +{ + /* A broadcast of a single element is just a vmv.s.x. */ + operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); +} + [(set_attr "type" "vimov,vimov,vimovxv,vimovxv") (set_attr "mode" "<MODE>")]) -(define_insn "*pred_broadcast<mode>_zvfh" - [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr") +(define_insn_and_rewrite "pred_broadcast<mode>_zvfh" + [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1") - (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") - (match_operand 5 "const_int_operand" " i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (vec_duplicate:V_VLSF - (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f")) - (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "@ vfmv.v.f\t%0,%3 vfmv.v.f\t%0,%3 vfmv.s.f\t%0,%3 vfmv.s.f\t%0,%3" + "&& (operands[1] == CONSTM1_RTX (<VM>mode) + && operands[4] == CONST1_RTX (Pmode) + && (register_operand (operands[3], <VEL>mode) + || satisfies_constraint_J (operands[3])))" +{ + /* A broadcast of a single element is just a vfmv.s.f. 
*/ + operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); +} [(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv") (set_attr "mode" "<MODE>")]) -(define_insn "*pred_broadcast<mode>_zvfhmin" - [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr") - (if_then_else:V_VLSF_ZVFHMIN - (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" " vm, vm, Wc1, Wc1") - (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") - (match_operand 5 "const_int_operand" " i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (vec_duplicate:V_VLSF_ZVFHMIN - (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A")) - (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))] - "TARGET_VECTOR && strided_load_broadcast_p ()" - "@ - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero - vlse<sew>.v\t%0,%3,zero" - [(set_attr "type" "vlds,vlds,vlds,vlds") - (set_attr "mode" "<MODE>")]) - (define_insn "*pred_broadcast<mode>_extended_scalar" [(set (match_operand:V_VLSI_D 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSI_D @@ -2398,6 +2327,117 @@ [(set_attr "type" "vimov,vimov") (set_attr "mode" "<MODE>")]) +(define_expand "@pred_strided_broadcast<mode>" + [(set (match_operand:V_VLS 0 "register_operand") + (if_then_else:V_VLS + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand") + (match_operand 4 "vector_length_operand") + (match_operand 5 "const_int_operand") + (match_operand 6 "const_int_operand") + (match_operand 7 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLS + (match_operand:<VEL> 3 "strided_broadcast_operand")) + (match_operand:V_VLS 2 "vector_merge_operand")))] + "TARGET_VECTOR" +{ + if (satisfies_constraint_Wb1 (operands[1])) + { + /* If we're asked to set a single element (like vmv.s.x but we + need to go via memory here) and the tail policy is agnostic + we can overwrite all elements. + Thus, set the mask to broadcast. */ + operands[1] = CONSTM1_RTX (<VM>mode); + if (!satisfies_constraint_vu (operands[2]) + && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD) + { + /* Case 2: vmv.s.x (TU, x == memory) ==> + vl = 0 or 1; + vlse.v (TU) in RV32 system */ + /* In this case we must not overwrite the residual elements, + so set the vector length to 0/1. 
*/ + operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); + } + } +}) + +(define_insn_and_split "*pred_strided_broadcast<mode>" + [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSI + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLSI + (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A")) + (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))] + "TARGET_VECTOR" + "@ + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero + vlse<sew>.v\t%0,%3,zero" + "&& !strided_load_broadcast_p () && can_create_pseudo_p ()" + [(const_int 0)] + { + rtx tmp = gen_reg_rtx (V1DImode); + emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3])); + tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode); + + emit_insn + (gen_pred_gather<mode>_scalar + (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode), + operands[4], operands[5], operands[6], operands[7])); + DONE; + } + [(set_attr "type" "vlds,vlds,vlds,vlds") + (set_attr "mode" "<MODE>")]) + +(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin" + [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr") + (if_then_else:V_VLSF_ZVFHMIN + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm, Wc1, Wc1") + (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLSF_ZVFHMIN + (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A")) + (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))] + "TARGET_VECTOR" + "@ + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero + vlse<sew>.v\t%0,%3,zero" + "&& !strided_load_broadcast_p () + && <VEL>mode == HFmode + && can_create_pseudo_p ()" + [(const_int 0)] + { + poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode); + machine_mode vmodehi + = riscv_vector::get_vector_mode (HImode, nunits).require (); + rtx ops[] = {gen_lowpart (vmodehi, operands[0]), + gen_lowpart (HImode, operands[3])}; + riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi), + riscv_vector::UNARY_OP, ops, + (riscv_vector::avl_type) INTVAL (operands[7]), + operands[4]); + DONE; + } + [(set_attr "type" "vlds,vlds,vlds,vlds") + (set_attr "mode" "<MODE>")]) + + ;; ------------------------------------------------------------------------------- ;; ---- Predicated Strided loads/stores ;; ------------------------------------------------------------------------------- @@ -4639,8 +4679,8 @@ ;; Handle GET_MODE_INNER (mode) = DImode. We need to split them since ;; we need to deal with SEW = 64 in RV32 system. 
(define_expand "@pred_<sat_op><mode>_scalar" - [(set (match_operand:VI_D 0 "register_operand") - (if_then_else:VI_D + [(set (match_operand:V_VLSI_D 0 "register_operand") + (if_then_else:V_VLSI_D (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand") (match_operand 5 "vector_length_operand") @@ -4651,10 +4691,10 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) - (unspec:VI_D - [(match_operand:VI_D 3 "register_operand") + (unspec:V_VLSI_D + [(match_operand:V_VLSI_D 3 "register_operand") (match_operand:<VEL> 4 "reg_or_int_operand")] VSAT_ARITH_OP) - (match_operand:VI_D 2 "vector_merge_operand")))] + (match_operand:V_VLSI_D 2 "vector_merge_operand")))] "TARGET_VECTOR" { if (riscv_vector::sew64_scalar_helper ( @@ -4673,8 +4713,8 @@ }) (define_insn "*pred_<sat_op><mode>_scalar" - [(set (match_operand:VI_D 0 "register_operand" "=vd, vr, vd, vr") - (if_then_else:VI_D + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr, vd, vr") + (if_then_else:V_VLSI_D (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -4685,18 +4725,18 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) - (unspec:VI_D - [(match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr") + (unspec:V_VLSI_D + [(match_operand:V_VLSI_D 3 "register_operand" " vr, vr, vr, vr") (match_operand:<VEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ")] VSAT_ARITH_OP) - (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "v<sat_op>.vx\t%0,%3,%z4%p1" [(set_attr "type" "<sat_insn_type>") (set_attr "mode" "<MODE>")]) (define_insn "*pred_<sat_op><mode>_extended_scalar" - [(set (match_operand:VI_D 0 "register_operand" "=vd, vr, vd, vr") - (if_then_else:VI_D + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr, vd, vr") + (if_then_else:V_VLSI_D (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -4707,11 +4747,11 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) - (unspec:VI_D - [(match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr") + (unspec:V_VLSI_D + [(match_operand:V_VLSI_D 3 "register_operand" " vr, vr, vr, vr") (sign_extend:<VEL> (match_operand:<VSUBEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ"))] VSAT_ARITH_OP) - (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR && !TARGET_64BIT" "v<sat_op>.vx\t%0,%3,%z4%p1" [(set_attr "type" "<sat_insn_type>") diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md index 5ed6bac..34b4a8f 100644 --- a/gcc/config/riscv/xiangshan.md +++ b/gcc/config/riscv/xiangshan.md @@ -107,7 +107,8 @@ ;; they are just dummies like this one. 
(define_insn_reservation "xiangshan_alu_unknown" 1 (and (eq_attr "tune" "xiangshan") - (eq_attr "type" "zicond,min,max,minu,maxu,clz,ctz,cpop,ghost,rotate,clmul,condmove,crypto,mvpair,rdvlenb,rdvl,wrvxrm,wrfrm,rdfrm,vsetvl,vsetvl_pre,vlde,vste,vldm,vstm,vlds,vsts,vldux,vldox,vstux,vstox,vldff,vldr,vstr,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,vssegtux,vssegtox,vlsegdff,vialu,viwalu,vext,vicalu,vshift,vnshift,vicmp,viminmax,vimul,vidiv,viwmul,vimuladd,sf_vqmacc,viwmuladd,vimerge,vimov,vsalu,vaalu,vsmul,vsshift,vnclip,sf_vfnrclip,vfalu,vfwalu,vfmul,vfdiv,vfwmul,vfmuladd,vfwmuladd,vfsqrt,vfrecp,vfcmp,vfminmax,vfsgnj,vfclass,vfmerge,vfmov,vfcvtitof,vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,vfncvtftoi,vfncvtftof,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,vfmovvf,vfmovfv,vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16")) + (eq_attr "type" "zicond,min,max,minu,maxu,clz,ctz,cpop,ghost,rotate,clmul,condmove,crypto,mvpair,rdvlenb,rdvl,wrvxrm,wrfrm,rdfrm,vsetvl,vsetvl_pre,vlde,vste,vldm,vstm,vlds,vsts,vldux,vldox,vstux,vstox,vldff,vldr,vstr,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,vssegtux,vssegtox,vlsegdff,vialu,viwalu,vext,vicalu,vshift,vnshift,vicmp,viminmax,vimul,vidiv,viwmul,vimuladd,sf_vqmacc,viwmuladd,vimerge,vimov,vsalu,vaalu,vsmul,vsshift,vnclip,sf_vfnrclip,vfalu,vfwalu,vfmul,vfdiv,vfwmul,vfmuladd,vfwmuladd,vfsqrt,vfrecp,vfcmp,vfminmax,vfsgnj,vfclass,vfmerge,vfmov,vfcvtitof,vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,vfncvtftoi,vfncvtftof,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,vfmovvf,vfmovfv,vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,sf_vc,sf_vc_se")) + "xs_alu_rs") ;; ---------------------------------------------------- diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 7ee26e5..764b499 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4951,10 +4951,19 @@ static bool rs6000_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed) + bool is_packed, + bool is_gather_scatter) { if (TARGET_VSX) { + if (is_gather_scatter) + { + if (TARGET_ALTIVEC && is_packed) + return false; + else + return true; + } + if (TARGET_EFFICIENT_UNALIGNED_VSX) return true; @@ -5165,6 +5174,7 @@ public: protected: void update_target_cost_per_stmt (vect_cost_for_stmt, stmt_vec_info, + slp_tree node, vect_cost_model_location, unsigned int); void density_test (loop_vec_info); void adjust_vect_cost_per_loop (loop_vec_info); @@ -5312,6 +5322,7 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind, void rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind, stmt_vec_info stmt_info, + slp_tree node, vect_cost_model_location where, unsigned int orig_count) { @@ -5372,12 +5383,12 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind, or may not need to apply. 
When finalizing the cost of the loop, the extra penalty is applied when the load density heuristics are satisfied. */ - if (kind == vec_construct && stmt_info - && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type - && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP)) + if (kind == vec_construct && node + && SLP_TREE_TYPE (node) == load_vec_info_type + && (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE + || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP)) { - tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree vectype = SLP_TREE_VECTYPE (node); unsigned int nunits = vect_nunits_for_cost (vectype); /* As PR103702 shows, it's possible that vectorizer wants to do costings for only one unit here, it's no need to do any @@ -5406,7 +5417,7 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind, unsigned rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { @@ -5424,7 +5435,7 @@ rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind, retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost); m_costs[where] += retval; - update_target_cost_per_stmt (kind, stmt_info, where, orig_count); + update_target_cost_per_stmt (kind, stmt_info, node, where, orig_count); } return retval; @@ -10309,15 +10320,18 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot) /* case b. xx0..01..1xx: some of 15 x's (and some of 16 0's) are rotated over the highest bit. */ - int pos_one = clz_hwi ((c << 16) >> 16); - middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one)); - int middle_ones = clz_hwi (~(c << pos_one)); - if (middle_zeros >= 16 && middle_ones >= 33) + unsigned HOST_WIDE_INT uc = c; + int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16); + if (pos_one != 0) { - *rot = pos_one; - return true; + middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one)); + int middle_ones = clz_hwi (~(uc << pos_one)); + if (middle_zeros >= 16 && middle_ones >= 33) + { + *rot = pos_one; + return true; + } } - return false; } @@ -10434,7 +10448,8 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask) if (lz >= HOST_BITS_PER_WIDE_INT) return false; - int middle_ones = clz_hwi (~(c << lz)); + unsigned HOST_WIDE_INT uc = c; + int middle_ones = clz_hwi (~(uc << lz)); if (tz + lz + middle_ones >= ones && (tz - lz) < HOST_BITS_PER_WIDE_INT && tz < HOST_BITS_PER_WIDE_INT) @@ -10468,7 +10483,7 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask) if (!IN_RANGE (pos_first_1, 1, HOST_BITS_PER_WIDE_INT-1)) return false; - middle_ones = clz_hwi (~c << pos_first_1); + middle_ones = clz_hwi ((~(unsigned HOST_WIDE_INT) c) << pos_first_1); middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_first_1)); if (pos_first_1 < HOST_BITS_PER_WIDE_INT && middle_ones + middle_zeros < HOST_BITS_PER_WIDE_INT @@ -10570,7 +10585,8 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns) { /* li/lis; rldicX */ unsigned HOST_WIDE_INT imm = (c | ~mask); - imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift)); + if (shift != 0) + imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift)); count_or_emit_insn (temp, GEN_INT (imm)); if (shift != 0) diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 9c718ca..e31ee40 100644 --- 
a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -1969,7 +1969,7 @@ [(set (match_dup 0) (plus:GPR (match_dup 1) (match_dup 3))) (set (match_dup 0) (plus:GPR (match_dup 0) (match_dup 4)))] { - HOST_WIDE_INT val = INTVAL (operands[2]); + unsigned HOST_WIDE_INT val = UINTVAL (operands[2]); HOST_WIDE_INT low = sext_hwi (val, 16); HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode); diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index d760a7e..6becad1 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -128,6 +128,8 @@ extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); extern rtx s390_expand_merge_perm_const (machine_mode, bool); extern void s390_expand_merge (rtx, rtx, rtx, bool); +extern void s390_expand_int_spaceship (rtx, rtx, rtx, rtx); +extern void s390_expand_fp_spaceship (rtx, rtx, rtx, rtx); extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index b5e636c..012b6db 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -8213,6 +8213,167 @@ s390_expand_atomic (machine_mode mode, enum rtx_code code, NULL_RTX, 1, OPTAB_DIRECT), 1); } +/* Expand integer op0 = op1 <=> op2, i.e., + op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : 1. + + Signedness is specified by op3. If op3 equals 1, then perform an unsigned + comparison, and if op3 equals -1, then perform a signed comparison. + + For integer comparisons we strive for a sequence like + CR[L] ; LHI ; LOCHIL ; LOCHIH + where the first three instructions fit into a group. */ + +void +s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) +{ + gcc_assert (op3 == const1_rtx || op3 == constm1_rtx); + + rtx cc, cond_lt, cond_gt; + machine_mode cc_mode; + machine_mode mode = GET_MODE (op1); + + /* Prior to VXE3, emulate a 128-bit comparison by breaking it up into three + comparisons. First test the high halves. If they are equal, then test + the low halves. Finally, test for equality. Depending on the results, + make use of LOCs. */ + if (mode == TImode && !TARGET_VXE3) + { + gcc_assert (TARGET_VX); + op1 + = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0)); + op2 + = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0)); + rtx lab = gen_label_rtx (); + rtx ccz = gen_rtx_REG (CCZmode, CC_REGNUM); + /* Compare high halves for equality. + VEC[L]G op1, op2 sets + CC1 if high(op1) < high(op2) + and + CC2 if high(op1) > high(op2). */ + machine_mode cc_mode = op3 == const1_rtx ? CCUmode : CCSmode; + rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + emit_insn (gen_rtx_SET ( + gen_rtx_REG (cc_mode, CC_REGNUM), + gen_rtx_COMPARE (cc_mode, + gen_rtx_VEC_SELECT (DImode, op1, lane0), + gen_rtx_VEC_SELECT (DImode, op2, lane0)))); + s390_emit_jump (lab, gen_rtx_NE (CCZmode, ccz, const0_rtx)); + /* At this point we know that the high halves are equal.
+ VCHLGS op2, op1 sets CC1 if low(op1) < low(op2) */ + emit_insn (gen_rtx_PARALLEL ( + VOIDmode, + gen_rtvec (2, + gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM), + gen_rtx_COMPARE (CCVIHUmode, op2, op1)), + gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode))))); + emit_label (lab); + emit_insn (gen_rtx_SET (op0, const1_rtx)); + emit_insn ( + gen_movsicc (op0, + gen_rtx_LTU (CCUmode, gen_rtx_REG (CCUmode, CC_REGNUM), + const0_rtx), + constm1_rtx, op0)); + /* Deal with the case where both halves are equal. */ + emit_insn (gen_rtx_PARALLEL ( + VOIDmode, + gen_rtvec (2, + gen_rtx_SET (gen_rtx_REG (CCVEQmode, CC_REGNUM), + gen_rtx_COMPARE (CCVEQmode, op1, op2)), + gen_rtx_SET (gen_reg_rtx (V2DImode), + gen_rtx_EQ (V2DImode, op1, op2))))); + emit_insn (gen_movsicc (op0, gen_rtx_EQ (CCZmode, ccz, const0_rtx), + const0_rtx, op0)); + return; + } + + if (mode == QImode || mode == HImode) + { + rtx_code extend = op3 == const1_rtx ? ZERO_EXTEND : SIGN_EXTEND; + op1 = simplify_gen_unary (extend, SImode, op1, mode); + op1 = force_reg (SImode, op1); + op2 = simplify_gen_unary (extend, SImode, op2, mode); + op2 = force_reg (SImode, op2); + mode = SImode; + } + + if (op3 == const1_rtx) + { + cc_mode = CCUmode; + cc = gen_rtx_REG (cc_mode, CC_REGNUM); + cond_lt = gen_rtx_LTU (mode, cc, const0_rtx); + cond_gt = gen_rtx_GTU (mode, cc, const0_rtx); + } + else + { + cc_mode = CCSmode; + cc = gen_rtx_REG (cc_mode, CC_REGNUM); + cond_lt = gen_rtx_LT (mode, cc, const0_rtx); + cond_gt = gen_rtx_GT (mode, cc, const0_rtx); + } + + emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (cc_mode, op1, op2))); + emit_move_insn (op0, const0_rtx); + emit_insn (gen_movsicc (op0, cond_lt, constm1_rtx, op0)); + emit_insn (gen_movsicc (op0, cond_gt, const1_rtx, op0)); +} + +/* Expand floating-point op0 = op1 <=> op2, i.e., + op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2. + + If op3 equals const0_rtx, then we are interested in the compare only (see + test spaceship-fp-4.c). Otherwise, op3 is a CONST_INT different from + const1_rtx and constm1_rtx, which is used to set op0 in the unordered case. + + Emit a branch-only solution, i.e., let if-convert fold the branches into + LOCs if applicable. This has the benefit that the solution is also + applicable if we are only interested in the compare, i.e., if op3 equals + const0_rtx.
+ */ + +void +s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) +{ + gcc_assert (op3 != const1_rtx && op3 != constm1_rtx); + + machine_mode mode = GET_MODE (op1); + machine_mode cc_mode = s390_select_ccmode (LTGT, op1, op2); + rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); + rtx cond_unordered = gen_rtx_UNORDERED (mode, cc_reg, const0_rtx); + rtx cond_eq = gen_rtx_EQ (mode, cc_reg, const0_rtx); + rtx cond_gt = gen_rtx_GT (mode, cc_reg, const0_rtx); + rtx_insn *insn; + rtx l_unordered = gen_label_rtx (); + rtx l_eq = gen_label_rtx (); + rtx l_gt = gen_label_rtx (); + rtx l_end = gen_label_rtx (); + + s390_emit_compare (VOIDmode, LTGT, op1, op2); + if (!flag_finite_math_only) + { + insn = s390_emit_jump (l_unordered, cond_unordered); + add_reg_br_prob_note (insn, profile_probability::very_unlikely ()); + } + insn = s390_emit_jump (l_eq, cond_eq); + add_reg_br_prob_note (insn, profile_probability::unlikely ()); + insn = s390_emit_jump (l_gt, cond_gt); + add_reg_br_prob_note (insn, profile_probability::even ()); + emit_move_insn (op0, constm1_rtx); + emit_jump (l_end); + emit_label (l_eq); + emit_move_insn (op0, const0_rtx); + emit_jump (l_end); + emit_label (l_gt); + emit_move_insn (op0, const1_rtx); + if (!flag_finite_math_only) + { + emit_jump (l_end); + emit_label (l_unordered); + rtx unord_val = op3 == const0_rtx ? const2_rtx : op3; + emit_move_insn (op0, unord_val); + } + emit_label (l_end); +} + /* This is called from dwarf2out.cc via TARGET_ASM_OUTPUT_DWARF_DTPREL. We need to emit DTP-relative relocations. */ @@ -16874,8 +17035,9 @@ s390_valid_target_attribute_inner_p (tree args, generate_option (opt, NULL, value, CL_TARGET, &decoded); s390_handle_option (opts, new_opts_set, &decoded, input_location); set_option (opts, new_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, - global_dc); + p + opt_len, + static_cast<int> (diagnostics::kind::unspecified), + input_location, global_dc); } else { @@ -16892,8 +17054,9 @@ s390_valid_target_attribute_inner_p (tree args, arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); if (arg_ok) set_option (opts, new_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, - global_dc); + p + opt_len, + static_cast<int> (diagnostics::kind::unspecified), + input_location, global_dc); else { error ("attribute %<target%> argument %qs is unknown", orig_p); @@ -17345,13 +17508,15 @@ static bool s390_support_vector_misalignment (machine_mode mode ATTRIBUTE_UNUSED, const_tree type ATTRIBUTE_UNUSED, int misalignment ATTRIBUTE_UNUSED, - bool is_packed ATTRIBUTE_UNUSED) + bool is_packed ATTRIBUTE_UNUSED, + bool is_gather_scatter ATTRIBUTE_UNUSED) { if (TARGET_VX) return true; return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* The vector ABI requires vector types to be aligned on an 8 byte @@ -17843,9 +18008,11 @@ f_constraint_p (const char *constraint) for (size_t i = 0, c_len = strlen (constraint); i < c_len; i += CONSTRAINT_LEN (constraint[i], constraint + i)) { - if (constraint[i] == 'f') + if (constraint[i] == 'f' + || (constraint[i] == '{' && constraint[i + 1] == 'f')) seen_f_p = true; - if (constraint[i] == 'v') + if (constraint[i] == 'v' + || (constraint[i] == '{' && constraint[i + 1] == 'v')) seen_v_p = true; } @@ -17935,7 +18102,8 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, continue; bool allows_mem, allows_reg, is_inout; bool ok = parse_output_constraint (&constraint, i, ninputs, noutputs, - 
&allows_mem, &allows_reg, &is_inout); + &allows_mem, &allows_reg, &is_inout, + nullptr); gcc_assert (ok); if (!f_constraint_p (constraint)) /* Long double with a constraint other than "=f" - nothing to do. */ @@ -17980,7 +18148,7 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, bool allows_mem, allows_reg; bool ok = parse_input_constraint (&constraint, i, ninputs, noutputs, 0, constraints.address (), &allows_mem, - &allows_reg); + &allows_reg, nullptr); gcc_assert (ok); if (!f_constraint_p (constraint)) /* Long double with a constraint other than "f" (or "=f" for inout diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 1edbfde..8cc48b0 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -1527,6 +1527,27 @@ operands[0] = SET_DEST (PATTERN (curr_insn)); }) +; Restrict the spaceship optab to z13 or later, since that is where +; LOAD HALFWORD IMMEDIATE ON CONDITION becomes available. + +(define_mode_iterator SPACESHIP_INT [(TI "TARGET_VX") DI SI HI QI]) +(define_expand "spaceship<mode>4" + [(match_operand:SI 0 "register_operand") + (match_operand:SPACESHIP_INT 1 "register_operand") + (match_operand:SPACESHIP_INT 2 "register_operand") + (match_operand:SI 3 "const_int_operand")] + "TARGET_Z13 && TARGET_64BIT" + "s390_expand_int_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;") + +(define_mode_iterator SPACESHIP_BFP [TF DF SF]) +(define_expand "spaceship<mode>4" + [(match_operand:SI 0 "register_operand") + (match_operand:SPACESHIP_BFP 1 "register_operand") + (match_operand:SPACESHIP_BFP 2 "register_operand") + (match_operand:SI 3 "const_int_operand")] + "TARGET_Z13 && TARGET_64BIT && TARGET_HARD_FLOAT" + "s390_expand_fp_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;") + ; (TF|DF|SF|TD|DD|SD) instructions diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index b75cec1..d75cba4 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -601,8 +601,8 @@ constantpool_address_p (const_rtx addr) /* Make sure the address is word aligned. */ offset = XEXP (addr, 1); - if ((!CONST_INT_P (offset)) - || ((INTVAL (offset) & 3) != 0)) + if (! CONST_INT_P (offset) + || (INTVAL (offset) & 3) != 0) return false; sym = XEXP (addr, 0); @@ -611,6 +611,7 @@ constantpool_address_p (const_rtx addr) if (SYMBOL_REF_P (sym) && CONSTANT_POOL_ADDRESS_P (sym)) return true; + return false; } @@ -4694,29 +4695,56 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, } } +/* Return TRUE if the specified insn corresponds to one or more L32R machine + instructions. */ + static bool xtensa_is_insn_L32R_p (const rtx_insn *insn) { - rtx x = PATTERN (insn); + rtx pat, dest, src; + machine_mode mode; - if (GET_CODE (x) != SET) + /* RTX insns that are not "(set (reg) ...)" cannot become L32R instructions: + - PATTERN () may be applied to the insn without prior validation; see + insn_cost () in gcc/rtlanal.cc. + - register_operand () is used instead of REG_P () so that operands that + do not look like REGs yet but will eventually become them are also + recognized. */ + if (GET_CODE (pat = PATTERN (insn)) != SET + || ! register_operand (dest = SET_DEST (pat), VOIDmode)) return false; - x = XEXP (x, 1); - if (MEM_P (x)) - { - x = XEXP (x, 0); - return (SYMBOL_REF_P (x) || CONST_INT_P (x)) - && CONSTANT_POOL_ADDRESS_P (x); - } - - /* relaxed MOVI instructions, that will be converted to L32R by the - assembler. */ - if (CONST_INT_P (x) - && !
xtensa_simm12b (INTVAL (x))) + /* If the source is a reference to a literal pool entry, then the insn + obviously corresponds to an L32R instruction. */ + if (constantpool_mem_p (src = SET_SRC (pat))) return true; - return false; + /* Similarly, an insn whose source is not a constant obviously does not + correspond to L32R. */ + if (! CONSTANT_P (src)) + return false; + + /* If the source is a CONST_INT whose value fits into signed 12 bits, then + the insn corresponds to a MOVI instruction (rather than an L32R one), + regardless of the configuration of TARGET_CONST16 or + TARGET_AUTOLITPOOLS. Note that the destination register can be non- + SImode. */ + if (((mode = GET_MODE (dest)) == SImode + || mode == HImode || mode == SFmode) + && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src))) + return false; + + /* If TARGET_CONST16 is configured, constants of the remaining forms + correspond to pairs of CONST16 instructions, not L32R. */ + if (TARGET_CONST16) + return false; + + /* The last remaining form of constant is one of the following: + - CONST_INTs with large values + - floating-point constants + - symbolic constants + all of which are handled by a relaxed MOVI instruction, which is later + converted to an L32R instruction by the assembler. */ + return true; } /* Compute the relative costs of RTL insns. This is necessary in order to @@ -4725,7 +4753,7 @@ xtensa_is_insn_L32R_p (const rtx_insn *insn) static int xtensa_insn_cost (rtx_insn *insn, bool speed) { - if (!(recog_memoized (insn) < 0)) + if (! (recog_memoized (insn) < 0)) { int len = get_attr_length (insn); @@ -4738,7 +4766,7 @@ xtensa_insn_cost (rtx_insn *insn, bool speed) /* "L32R" may be particularly slow (implementation-dependent). */ if (xtensa_is_insn_L32R_p (insn)) - return COSTS_N_INSNS (1 + xtensa_extra_l32r_costs); + return COSTS_N_INSNS ((1 + xtensa_extra_l32r_costs) * n); /* Cost based on the pipeline model. */ switch (get_attr_type (insn)) @@ -4783,7 +4811,7 @@ xtensa_insn_cost (rtx_insn *insn, bool speed) { /* "L32R" itself plus constant in litpool. */ if (xtensa_is_insn_L32R_p (insn)) - len = 3 + 4; + len += (len / 3) * 4; /* Consider fractional instruction length (for example, ".n" short instructions or "L32R" litpool constants.
*/ diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 029be99..629dfdd 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -1297,7 +1297,10 @@ std::swap (operands[0], operands[1]); std::swap (operands[2], operands[3]); } -}) +} + [(set_attr "type" "move,move,load,load,store") + (set_attr "mode" "DI") + (set_attr "length" "6,12,6,6,6")]) (define_split [(set (match_operand:DI 0 "register_operand") @@ -1344,7 +1347,7 @@ %v0s32i\t%1, %0 rsr\t%0, ACCLO wsr\t%1, ACCLO" - [(set_attr "type" "move,move,move,load,store,store,move,move,move,move,move,load,load,store,rsr,wsr") + [(set_attr "type" "move,move,move,load,store,store,move,move,move,load,move,load,load,store,rsr,wsr") (set_attr "mode" "SI") (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")]) @@ -1410,7 +1413,7 @@ %v0s16i\t%1, %0 rsr\t%0, ACCLO wsr\t%1, ACCLO" - [(set_attr "type" "move,move,move,move,move,load,load,store,rsr,wsr") + [(set_attr "type" "move,move,move,move,load,load,load,store,rsr,wsr") (set_attr "mode" "HI") (set_attr "length" "2,2,3,3,3,3,3,3,3,3")]) @@ -1519,7 +1522,7 @@ const16\t%0, %t1\;const16\t%0, %b1 %v1l32i\t%0, %1 %v0s32i\t%1, %0" - [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,move,move,load,store") + [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,load,move,load,store") (set_attr "mode" "SF") (set_attr "length" "3,3,3,2,3,2,2,3,3,3,3,6,3,3")]) @@ -1643,7 +1646,10 @@ std::swap (operands[0], operands[1]); std::swap (operands[2], operands[3]); } -}) +} + [(set_attr "type" "move,load,move,load,load,store") + (set_attr "mode" "DF") + (set_attr "length" "6,6,12,6,6,6")]) ;; Block moves |
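The s390 spaceship expanders added above produce the <=> result values spelled out in the s390.cc comments: -1/0/1 for ordered operands, plus a configurable value (2 by default) for unordered floating-point operands. As a reference for those semantics only, here is a minimal C++ sketch; it is not GCC code, the function names are invented, and op3 follows the conventions stated in the comments (integer case: 1 = unsigned, -1 = signed; floating-point case: 0 = compare only, in which case 2 is used for unordered).

#include <cstdint>
#include <cstdio>
#include <cmath>

/* Integer case: op3 selects the signedness of the comparison.  */
static int spaceship_int (int64_t op1, int64_t op2, int op3)
{
  if (op3 == 1)		/* Unsigned comparison.  */
    {
      uint64_t u1 = op1, u2 = op2;
      return u1 == u2 ? 0 : (u1 < u2 ? -1 : 1);
    }
  /* Signed comparison (op3 == -1).  */
  return op1 == op2 ? 0 : (op1 < op2 ? -1 : 1);
}

/* Floating-point case: a NaN operand makes the comparison unordered;
   op3 supplies the unordered result, with 2 as the default.  */
static int spaceship_fp (double op1, double op2, int op3)
{
  if (op1 != op1 || op2 != op2)	/* Unordered.  */
    return op3 == 0 ? 2 : op3;
  return op1 == op2 ? 0 : (op1 < op2 ? -1 : 1);
}

int main ()
{
  /* Signed integer comparisons: prints -1 0 1.  */
  printf ("%d %d %d\n", spaceship_int (1, 2, -1), spaceship_int (2, 2, -1),
	  spaceship_int (3, 2, -1));
  /* Unordered floating-point comparison: prints 2.  */
  printf ("%d\n", spaceship_fp (std::nan (""), 1.0, 0));
  return 0;
}

On z13 and later the integer variant can then be lowered to the compare plus LOAD HALFWORD IMMEDIATE ON CONDITION sequence that the comment before the new spaceship expanders in s390.md refers to, while the floating-point variant is emitted as branches that if-conversion may fold into LOCs.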