Diffstat (limited to 'gcc/config')
31 files changed, 851 insertions, 222 deletions
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 1c3e697..db88df0 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -128,7 +128,9 @@ AARCH64_OPT_FMV_EXTENSION("sha2", SHA2, (SIMD), (), (), "sha1 sha2") AARCH64_FMV_FEATURE("sha3", SHA3, (SHA3)) -AARCH64_OPT_FMV_EXTENSION("aes", AES, (SIMD), (), (), "aes") +AARCH64_OPT_EXTENSION("aes", AES, (SIMD), (), (), "aes") + +AARCH64_FMV_FEATURE("aes", PMULL, (AES)) /* +nocrypto disables AES, SHA2 and SM4, and anything that depends on them (such as SHA3 and the SVE2 crypto extensions). */ @@ -171,8 +173,6 @@ AARCH64_OPT_FMV_EXTENSION("i8mm", I8MM, (SIMD), (), (), "i8mm") instructions. */ AARCH64_OPT_FMV_EXTENSION("bf16", BF16, (FP), (SIMD), (), "bf16") -AARCH64_FMV_FEATURE("rpres", RPRES, ()) - AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16, FCMA), (), (), "sve") /* This specifically does not imply +sve. */ @@ -190,7 +190,7 @@ AARCH64_OPT_FMV_EXTENSION("sve2", SVE2, (SVE), (), (), "sve2") AARCH64_OPT_EXTENSION("sve2-aes", SVE2_AES, (SVE2, AES), (), (), "sveaes") -AARCH64_FMV_FEATURE("sve2-aes", SVE_AES, (SVE2_AES)) +AARCH64_FMV_FEATURE("sve2-aes", SVE_PMULL128, (SVE2_AES)) AARCH64_OPT_EXTENSION("sve2-bitperm", SVE2_BITPERM, (SVE2), (), (), "svebitperm") @@ -245,9 +245,9 @@ AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "sme AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "smef16f16") -AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops") +AARCH64_OPT_FMV_EXTENSION("mops", MOPS, (), (), (), "mops") -AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc") +AARCH64_OPT_FMV_EXTENSION("cssc", CSSC, (), (), (), "cssc") AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr") diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index e946e8d..38c307c 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); rtx aarch64_sve_packed_pred (machine_mode); rtx aarch64_sve_fp_pred (machine_mode, rtx *); +rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx); void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode); bool aarch64_expand_maskloadstore (rtx *, machine_mode); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md index 6b3f439..6b1a747 100644 --- a/gcc/config/aarch64/aarch64-sme.md +++ b/gcc/config/aarch64/aarch64-sme.md @@ -62,6 +62,10 @@ ;; (b) they are sometimes used conditionally, particularly in streaming- ;; compatible code. ;; +;; To prevent the latter from upsetting the assembler, we emit the literal +;; encodings of "SMSTART SM" and "SMSTOP SM" when compiling without +;; TARGET_SME. +;; ;; ========================================================================= ;; ------------------------------------------------------------------------- @@ -161,7 +165,9 @@ (clobber (reg:VNx16BI P14_REGNUM)) (clobber (reg:VNx16BI P15_REGNUM))] "" - "smstart\tsm" + { + return TARGET_SME ? "smstart\tsm" : ".inst 0xd503437f // smstart sm"; + } ) ;; Turn off streaming mode. This clobbers all SVE state. @@ -196,7 +202,9 @@ (clobber (reg:VNx16BI P14_REGNUM)) (clobber (reg:VNx16BI P15_REGNUM))] "" - "smstop\tsm" + { + return TARGET_SME ? 
"smstop\tsm" : ".inst 0xd503427f // smstop sm"; + } ) ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def index 8e6aadc..117b70e 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def @@ -92,7 +92,8 @@ DEF_SME_FUNCTION (svstr_zt, str_zt, none, none) DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 && AARCH64_FL_FAMINMAX) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \ + | AARCH64_FL_FAMINMAX) DEF_SME_FUNCTION_GS (svamin, binary_opt_single_n, all_float, x24, none) DEF_SME_FUNCTION_GS (svamax, binary_opt_single_n, all_float, x24, none) #undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 2b627a9..01833a8 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -4004,7 +4004,8 @@ rtx function_expander::get_reg_target () { machine_mode target_mode = result_mode (); - if (!possible_target || GET_MODE (possible_target) != target_mode) + if (!possible_target + || !register_operand (possible_target, target_mode)) possible_target = gen_reg_rtx (target_mode); return possible_target; } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b252eef..80a3288 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -5605,18 +5605,21 @@ ;; Predicated floating-point operations with merging. (define_expand "@cond_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>") - (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")] + (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>") + (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" + { + operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]); + } ) ;; Predicated floating-point operations, merging with the first input. 
@@ -5644,14 +5647,14 @@ ) (define_insn "*cond_<optab><mode>_2_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] @@ -5687,14 +5690,14 @@ ) (define_insn "*cond_<optab><mode>_2_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] @@ -5730,14 +5733,14 @@ ) (define_insn "*cond_<optab><mode>_3_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] @@ -5794,16 +5797,16 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -5868,16 +5871,16 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + (match_operand:SVE_F 2 "register_operand") + 
(match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 4 ] @@ -5953,14 +5956,14 @@ ) (define_insn "*cond_add<mode>_2_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] @@ -6015,16 +6018,16 @@ ) (define_insn_and_rewrite "*cond_add<mode>_any_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 3 , 4 ] @@ -6266,14 +6269,14 @@ ) (define_insn "*cond_sub<mode>_3_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] @@ -6323,16 +6326,16 @@ ) (define_insn_and_rewrite "*cond_sub<mode>_const_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" {@ [ cons: =0 , 1 , 3 , 4 ] @@ -6913,7 +6916,7 @@ ;; Predicate AND. We can reuse one of the inputs as the GP. 
;; Doubling the second operand is the preferred implementation ;; of the MOV alias, so we use that instead of %1/z, %1, %2. -(define_insn "and<mode>3" +(define_insn "@and<mode>3" [(set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") (match_operand:PRED_ALL 2 "register_operand")))] @@ -7595,29 +7598,29 @@ ;; Unpredicated floating-point ternary operations. (define_expand "<optab><mode>4" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_dup 4) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 1 "register_operand") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_dup 5) + (match_operand:SVE_F_B16B16 1 "register_operand") + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_TERNARY))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" { - operands[4] = aarch64_ptrue_reg (<VPRED>mode); + operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]); } ) ;; Predicated floating-point ternary operations. (define_insn "@aarch64_pred_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 5 "aarch64_sve_gp_strictness") - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" {@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ] @@ -7631,17 +7634,17 @@ ;; Predicated floating-point ternary operations with merging. (define_expand "@cond_<optab><mode>" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>)" { @@ -7649,20 +7652,22 @@ second of the two. */ if (rtx_equal_p (operands[3], operands[5])) std::swap (operands[2], operands[3]); + + operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]); }) ;; Predicated floating-point ternary operations, merging with the ;; first input. 
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] @@ -7678,15 +7683,15 @@ ) (define_insn "*cond_<optab><mode>_2_strict" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "register_operand")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] @@ -7700,15 +7705,15 @@ ;; Predicated floating-point ternary operations, merging with the ;; third input. (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] @@ -7724,15 +7729,15 @@ ) (define_insn "*cond_<optab><mode>_4_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] @@ -7746,17 +7751,17 @@ ;; Predicated floating-point ternary operations, merging with an ;; independent value. 
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 6) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -7792,17 +7797,17 @@ ) (define_insn_and_rewrite "*cond_<optab><mode>_any_strict" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") + (unspec:SVE_F_B16B16 [(match_dup 1) (const_int SVE_STRICT_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand") + (match_operand:SVE_F_B16B16 4 "register_operand")] SVE_COND_FP_TERNARY) - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && (<supports_bf16> || !<is_bf16>) @@ -8201,20 +8206,23 @@ ;; ;; For unpacked vectors, it doesn't really matter whether SEL uses the ;; the container size or the element size. If SEL used the container size, -;; it would ignore undefined bits of the predicate but would copy the -;; upper (undefined) bits of each container along with the defined bits. -;; If SEL used the element size, it would use undefined bits of the predicate -;; to select between undefined elements in each input vector. Thus the only -;; difference is whether the undefined bits in a container always come from -;; the same input as the defined bits, or whether the choice can vary -;; independently of the defined bits. +;; it would would copy the upper (undefined) bits of each container along +;; with the corresponding defined bits. If SEL used the element size, +;; it would use separate predicate bits to select between the undefined +;; elements in each input vector; these seperate predicate bits might +;; themselves be undefined, depending on the mode of the predicate. +;; +;; Thus the only difference is whether the undefined bits in a container +;; always come from the same input as the defined bits, or whether the +;; choice can vary independently of the defined bits. ;; ;; For the other instructions, using the element size is more natural, ;; so we do that for SEL as well. 
+;; (define_insn "*vcond_mask_<mode><vpred>" [(set (match_operand:SVE_ALL 0 "register_operand") (unspec:SVE_ALL - [(match_operand:<VPRED> 3 "register_operand") + [(match_operand:<VPRED> 3 "aarch64_predicate_operand") (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm") (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index cb1699a..f4a2062 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3933,6 +3933,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness) return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode)); } +/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE + is a partial vector mode, and if exceptions must be suppressed for its + undefined elements, convert PRED from a container-level predicate to + an element-level predicate and ensure that the undefined elements + are inactive. Make no changes otherwise. + + Return the resultant predicate. */ +rtx +aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (data_mode); + if (flag_trapping_math && (vec_flags & VEC_PARTIAL)) + { + /* Generate an element-level mask. */ + rtx mask = aarch64_sve_packed_pred (data_mode); + machine_mode pmode = GET_MODE (mask); + + /* Apply the existing predicate. */ + rtx dst = gen_reg_rtx (pmode); + emit_insn (gen_and3 (pmode, dst, mask, + gen_lowpart (pmode, pred))); + return dst; + } + + return pred; +} + /* Emit a comparison CMP between OP0 and OP1, both of which have mode DATA_MODE, and return the result in a predicate of mode PRED_MODE. Use TARGET as the target register if nonnull and convenient. */ @@ -17166,8 +17193,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info, && STMT_VINFO_DATA_REF (stmt_info)) { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - if (stmt_info - && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES) + if (node + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -17438,8 +17465,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, for each element. We therefore need to divide the full-instruction cost by the number of elements in the vector. */ if (kind == scalar_load + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int nunits = vect_nunits_for_cost (vectype); /* Test for VNx2 modes, which have 64-bit containers. */ @@ -17451,8 +17479,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which a scalar_store is really storing one element in a scatter operation. */ if (kind == scalar_store + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. 
*/ @@ -17708,7 +17737,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && kind == vec_to_scalar && (m_vec_flags & VEC_ADVSIMD) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { auto dr = STMT_VINFO_DATA_REF (stmt_info); tree dr_ref = DR_REF (dr); @@ -17823,7 +17852,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && sve_issue && (kind == scalar_load || kind == scalar_store) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17978,9 +18007,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Check if we've seen an SVE gather/scatter operation and which size. */ if (kind == scalar_load + && node && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) @@ -20482,6 +20512,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2) unsigned long _size; // Size of the struct, so it can grow. unsigned long _hwcap; unsigned long _hwcap2; + unsigned long _hwcap3; + unsigned long _hwcap4; } */ @@ -20498,14 +20530,24 @@ build_ifunc_arg_type () tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, get_identifier ("_hwcap2"), long_unsigned_type_node); + tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap3"), + long_unsigned_type_node); + tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap4"), + long_unsigned_type_node); DECL_FIELD_CONTEXT (field1) = ifunc_arg_type; DECL_FIELD_CONTEXT (field2) = ifunc_arg_type; DECL_FIELD_CONTEXT (field3) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field4) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field5) = ifunc_arg_type; TYPE_FIELDS (ifunc_arg_type) = field1; DECL_CHAIN (field1) = field2; DECL_CHAIN (field2) = field3; + DECL_CHAIN (field3) = field4; + DECL_CHAIN (field4) = field5; layout_type (ifunc_arg_type); @@ -31964,9 +32006,43 @@ aarch64_test_sysreg_encoding_clashes (void) static void aarch64_test_sve_folding () { + aarch64_target_switcher switcher (AARCH64_FL_SVE); + tree res = fold_unary (BIT_NOT_EXPR, ssizetype, ssize_int (poly_int64 (1, 1))); ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1)))); + + auto build_v16bi = [](bool a, bool b) + { + rtx_vector_builder builder (VNx16BImode, 2, 1); + builder.quick_push (a ? const1_rtx : const0_rtx); + builder.quick_push (b ? 
const1_rtx : const0_rtx); + return builder.build (); + }; + rtx v16bi_10 = build_v16bi (1, 0); + rtx v16bi_01 = build_v16bi (0, 1); + + for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode }) + { + rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1); + rtx subreg = lowpart_subreg (VNx16BImode, reg, mode); + rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg); + rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode)); + + rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode)); + rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg); + + rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10); + ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode), + lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg), + VNx16BImode)); + rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg); + } } /* Run all target-specific selftests. */ diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h index f76a250..9eb1a20 100644 --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h @@ -26,7 +26,7 @@ static const struct cpu_addrcost_table generic_armv9_a_addrcost_table = { { - 1, /* hi */ + 0, /* hi */ 0, /* si */ 0, /* di */ 1, /* ti */ diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc index 6a88a27..69df6d2 100644 --- a/gcc/config/avr/avr-passes.cc +++ b/gcc/config/avr/avr-passes.cc @@ -4843,6 +4843,137 @@ avr_pass_fuse_add::execute1 (function *func) ////////////////////////////////////////////////////////////////////////////// +// Fuse 2 move insns after combine. 
+ +static const pass_data avr_pass_data_2moves = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + 0 // todo_flags_finish +}; + +class avr_pass_2moves : public rtl_opt_pass +{ +public: + avr_pass_2moves (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_2moves, ctxt) + { + this->name = name; + } + + unsigned int execute (function *func) final override + { + if (optimize && avropt_fuse_move2) + { + bool changed = false; + basic_block bb; + + FOR_EACH_BB_FN (bb, func) + { + changed |= optimize_2moves_bb (bb); + } + + if (changed) + { + df_note_add_problem (); + df_analyze (); + } + } + + return 0; + } + + bool optimize_2moves (rtx_insn *, rtx_insn *); + bool optimize_2moves_bb (basic_block); +}; // avr_pass_2moves + +bool +avr_pass_2moves::optimize_2moves_bb (basic_block bb) +{ + bool changed = false; + rtx_insn *insn1 = nullptr; + rtx_insn *insn2 = nullptr; + rtx_insn *curr; + + FOR_BB_INSNS (bb, curr) + { + if (insn1 && INSN_P (insn1) + && insn2 && INSN_P (insn2)) + changed |= optimize_2moves (insn1, insn2); + + insn1 = insn2; + insn2 = curr; + } + + return changed; +} + +bool +avr_pass_2moves::optimize_2moves (rtx_insn *insn1, rtx_insn *insn2) +{ + bool good = false; + bool bad = false; + rtx set1, dest1, src1; + rtx set2, dest2, src2; + + if ((set1 = single_set (insn1)) + && (set2 = single_set (insn2)) + && (src1 = SET_SRC (set1)) + && REG_P (src2 = SET_SRC (set2)) + && REG_P (dest1 = SET_DEST (set1)) + && REG_P (dest2 = SET_DEST (set2)) + && rtx_equal_p (dest1, src2) + // Now we have: + // insn1: dest1 = src1 + // insn2: dest2 = dest1 + && REGNO (dest1) >= FIRST_PSEUDO_REGISTER + // Paranoia. + && GET_CODE (PATTERN (insn1)) != PARALLEL + && GET_CODE (PATTERN (insn2)) != PARALLEL + && (rtx_equal_p (dest2, src1) + || !reg_overlap_mentioned_p (dest2, src1))) + { + avr_dump ("\n;; Found 2moves:\n%r\n%r\n", insn1, insn2); + avr_dump (";; reg %d: insn uses uids:", REGNO (dest1)); + + // Go check that dest1 is used exactly once, namely by insn2. + + df_ref use = DF_REG_USE_CHAIN (REGNO (dest1)); + for (; use; use = DF_REF_NEXT_REG (use)) + { + rtx_insn *user = DF_REF_INSN (use); + avr_dump (" %d", INSN_UID (user)); + good |= INSN_UID (user) == INSN_UID (insn2); + bad |= INSN_UID (user) != INSN_UID (insn2); + } + avr_dump (".\n"); + + if (good && !bad + // Propagate src1 to insn2: + // insn1: # Deleted + // insn2: dest2 = src1 + && validate_change (insn2, &SET_SRC (set2), src1, false)) + { + SET_INSN_DELETED (insn1); + return true; + } + } + + if (good && !bad) + avr_dump (";; Failed\n"); + + return false; +} + + + +////////////////////////////////////////////////////////////////////////////// // Split insns with nonzero_bits() after combine. static const pass_data avr_pass_data_split_nzb = @@ -5704,6 +5835,14 @@ make_avr_pass_casesi (gcc::context *ctxt) return new avr_pass_casesi (ctxt, "avr-casesi"); } +// Optimize 2 consecutive moves after combine. 
+ +rtl_opt_pass * +make_avr_pass_2moves (gcc::context *ctxt) +{ + return new avr_pass_2moves (ctxt, "avr-2moves"); +} + rtl_opt_pass * make_avr_pass_split_nzb (gcc::context *ctxt) { diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def index eb60a93..d668c7f 100644 --- a/gcc/config/avr/avr-passes.def +++ b/gcc/config/avr/avr-passes.def @@ -74,6 +74,14 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes); INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi); +/* Insn combine may come up with superfluous reg-reg moves, where the combine + people say that these are no problem since reg-alloc is supposed to optimize + them. The issue is that the lower-subreg pass sitting between combine and + reg-alloc may split such moves, coming up with a zoo of subregs which are + only handled poorly by the register allocator. */ + +INSERT_PASS_AFTER (pass_combine, 1, avr_pass_2moves); + /* Some combine insns have nonzero_bits() in their condition, though insns should not use such stuff in their condition. Therefore, we split such insn into something without nonzero_bits() in their condition right after diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h index ca30136..37911e7 100644 --- a/gcc/config/avr/avr-protos.h +++ b/gcc/config/avr/avr-protos.h @@ -208,6 +208,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *); extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *); extern rtl_opt_pass *make_avr_pass_split_nzb (gcc::context *); extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *); +extern rtl_opt_pass *make_avr_pass_2moves (gcc::context *); #ifdef RTX_CODE extern bool avr_casei_sequence_check_operands (rtx *xop); extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands); diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc index c469297..1fb59b6 100644 --- a/gcc/config/avr/avr.cc +++ b/gcc/config/avr/avr.cc @@ -14418,6 +14418,13 @@ avr_output_addr_vec (rtx_insn *labl, rtx table) // Output the label that precedes the table. ASM_OUTPUT_ALIGN (stream, 1); + + char s_labl[40]; + targetm.asm_out.generate_internal_label (s_labl, "L", + CODE_LABEL_NUMBER (labl)); + ASM_OUTPUT_TYPE_DIRECTIVE (stream, s_labl, + AVR_HAVE_JMP_CALL ? "object" : "function"); + targetm.asm_out.internal_label (stream, "L", CODE_LABEL_NUMBER (labl)); // Output the table's content. @@ -14984,10 +14991,11 @@ avr_addr_space_convert (rtx src, tree type_old, tree type_new) /* Linearize memory: RAM has bit 23 set. When as_new = __flashx then this is basically UB since __flashx mistreats RAM addresses, but there - is no way to bail out. (Though -Waddr-space-convert will tell.) */ + is no way to bail out. (Though -Waddr-space-convert will tell.) + ...but PR121277 is confusing, in particular when NULL is coming in. */ int msb = ADDR_SPACE_GENERIC_P (as_old) - ? 0x80 + ? as_new == ADDR_SPACE_MEMX ? 0x80 : 0x00 : avr_addrspace[as_old].segment; src = force_reg (Pmode, src); @@ -15085,10 +15093,16 @@ avr_convert_to_type (tree type, tree expr) const char *name_old = avr_addrspace[as_old].name; const char *name_new = avr_addrspace[as_new].name; - warning (OPT_Waddr_space_convert, - "conversion from address space %qs to address space %qs", - ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old, - ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new); + // Be relaxed when NULL is used, and when 0x0 stands for + // address 0x0. 
+ bool nowarn = (expr == null_pointer_node + && (as_new == ADDR_SPACE_FLASHX + || as_new == ADDR_SPACE_FLASH)); + if (!nowarn) + warning (OPT_Waddr_space_convert, + "conversion from address space %qs to address space %qs", + ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old, + ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new); return fold_build1_loc (loc, ADDR_SPACE_CONVERT_EXPR, type, expr); } diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt index 9883119..7f6f18c 100644 --- a/gcc/config/avr/avr.opt +++ b/gcc/config/avr/avr.opt @@ -164,6 +164,10 @@ mfuse-move= Target Joined RejectNegative UInteger Var(avropt_fuse_move) Init(0) Optimization IntegerRange(0, 23) -mfuse-move=<0,23> Optimization. Run a post-reload pass that tweaks move instructions. +mfuse-move2 +Target Var(avropt_fuse_move2) Init(0) Optimization +Optimization. Fuse some move insns after insn combine. + mabsdata Target Mask(ABSDATA) Assume that all data in static storage can be accessed by LDS / STS instructions. This option is only useful for reduced Tiny devices like ATtiny40. diff --git a/gcc/config/avr/avr.opt.urls b/gcc/config/avr/avr.opt.urls index 662fdee..87c26b2 100644 --- a/gcc/config/avr/avr.opt.urls +++ b/gcc/config/avr/avr.opt.urls @@ -92,6 +92,9 @@ UrlSuffix(gcc/AVR-Options.html#index-mfuse-move) mfuse-move= UrlSuffix(gcc/AVR-Options.html#index-mfuse-move) +mfuse-move2 +UrlSuffix(gcc/AVR-Options.html#index-mfuse-move2) + mabsdata UrlSuffix(gcc/AVR-Options.html#index-mabsdata) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 557568c..5ffeb23 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -54,6 +54,7 @@ #include "gimple.h" #include "cgraph.h" #include "case-cfn-macros.h" +#include "opts.h" /* This file should be included last. */ #include "target-def.h" @@ -183,6 +184,11 @@ gcn_option_override (void) if (flag_sram_ecc == HSACO_ATTR_DEFAULT) flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default; + + /* TODO: This seems to produce tighter loops, but the testsuite expects it + to be set to '2', so I'll leave it default for now. + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_vect_partial_vector_usage, 1); */ } /* }}} */ @@ -5789,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class, return bsd_libc_has_function (fn_class, type); } +/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */ + +static bool +gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode), + int ARG_UNUSED (scale), + unsigned int ARG_UNUSED (group_size)) +{ + return true; +} + /* }}} */ /* {{{ md_reorg pass. */ @@ -8140,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl) gcn_vectorize_builtin_vectorized_function #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode +#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER +#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c131577..53e86c8 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3226,7 +3226,7 @@ remove_partial_avx_dependency (void) break; } - /* Only hanlde conversion here. */ + /* Only handle conversion here. */ machine_mode src_mode = convert_p ? 
GET_MODE (XEXP (src, 0)) : VOIDmode; switch (src_mode) diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 2fedbeb..c2db305 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ -VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ VECTOR_MODE (FLOAT, BF, 2); /* V2BF */ VECTOR_MODE (FLOAT, HF, 6); /* V6HF */ @@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */ VECTOR_MODE (INT, QI, 12); /* V12QI */ VECTOR_MODE (INT, QI, 14); /* V14QI */ VECTOR_MODE (INT, HI, 6); /* V6HI */ -VECTOR_MODE (INT, SI, 64); /* V64SI */ INT_MODE (OI, 32); INT_MODE (XI, 64); diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index ca6bb83..09a35ef 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -3615,6 +3615,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. */ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + /* Can combine regparm with all attributes but fastcall, and thiscall. */ if (is_attribute_p ("regparm", name)) { @@ -3627,7 +3639,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) { - error ("regparam and thiscall attributes are not compatible"); + error ("regparm and thiscall attributes are not compatible"); } cst = TREE_VALUE (args); @@ -3648,19 +3660,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } - if (TARGET_64BIT) - { - /* Do not warn when emulating the MS ABI. */ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + /* Can combine fastcall with sseregparm. */ if (is_attribute_p ("fastcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3681,8 +3681,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, } } - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ + /* Can combine stdcall with regparm and sseregparm. */ else if (is_attribute_p ("stdcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3732,6 +3731,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, { error ("cdecl and thiscall attributes are not compatible"); } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("regparm and thiscall attributes are not compatible"); + } } /* Can combine sseregparm with all attributes. 
*/ diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 590cdf1..65e04d3 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -12442,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol; static rtx ix86_tls_get_addr (void) { + if (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + { + /* __tls_get_addr doesn't preserve vector registers. When a + function with no_caller_saved_registers attribute calls + __tls_get_addr, YMM and ZMM registers will be clobbered. + Issue an error and suggest -mtls-dialect=gnu2 in this case. */ + if (cfun->machine->func_type == TYPE_NORMAL) + error (G_("%<-mtls-dialect=gnu2%> must be used with a function" + " with the %<no_caller_saved_registers%> attribute")); + else + error (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%<-mtls-dialect=gnu2%> must be used with an" + " exception service routine") + : G_("%<-mtls-dialect=gnu2%> must be used with an" + " interrupt service routine")); + /* Don't issue the same error twice. */ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->call_saved_registers + = TYPE_DEFAULT_CALL_SAVED_REGISTERS; + } + if (!ix86_tls_symbol) { const char *sym @@ -20007,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree utype, ures, vce; utype = unsigned_type_for (TREE_TYPE (arg0)); /* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR - instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */ + instead of ABS_EXPR to handle overflow case(TYPE_MIN). */ ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); loc = gimple_location (stmt); @@ -21491,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) /* Register pair for mask registers. */ if (mode == P2QImode || mode == P2HImode) return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; + return 1; } @@ -23132,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, So current solution is make constant disp as cheap as possible. */ if (GET_CODE (addr) == PLUS && x86_64_immediate_operand (XEXP (addr, 1), Pmode) - /* Only hanlde (reg + disp) since other forms of addr are mostly LEA, + /* Only handle (reg + disp) since other forms of addr are mostly LEA, there's no additional cost for the plus of disp. */ && register_operand (XEXP (addr, 0), Pmode)) { @@ -25211,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global) return DW_EH_PE_absptr; } -/* Implement targetm.vectorize.builtin_vectorization_cost. */ +/* Worker for ix86_builtin_vectorization_cost and the fallback calls + from ix86_vector_costs::add_stmt_cost. 
*/ static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) +ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost, + machine_mode mode) { - bool fp = false; - machine_mode mode = TImode; + bool fp = FLOAT_MODE_P (mode); int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); - } - switch (type_of_cost) { case scalar_stmt: @@ -25283,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, COSTS_N_INSNS (ix86_cost->gather_static + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case vector_scatter_store: return ix86_vec_cost (mode, COSTS_N_INSNS (ix86_cost->scatter_static + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case cond_branch_taken: return ix86_cost->cond_taken_branch_cost; @@ -25308,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { - int n = TYPE_VECTOR_SUBPARTS (vectype); + int n = GET_MODE_NUNITS (mode); /* N - 1 element inserts into an SSE vector, the possible GPR -> XMM move is accounted for in add_stmt_cost. */ if (GET_MODE_BITSIZE (mode) <= 128) @@ -25336,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + machine_mode mode = TImode; + if (vectype != NULL) + mode = TYPE_MODE (vectype); + return ix86_default_vector_cost (type_of_cost, mode); +} + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. 
*/ @@ -25789,7 +25815,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree node, - tree vectype, int misalign, + tree vectype, int, vect_cost_model_location where) { unsigned retval = 0; @@ -26138,14 +26164,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER))))) { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } else if ((kind == vec_construct || kind == scalar_to_vec) && node && SLP_TREE_DEF_TYPE (node) == vect_external_def) { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); unsigned i; tree op; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) @@ -26209,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, TREE_VISITED (op) = 0; } if (stmt_cost == -1) - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); if (kind == vec_perm && vectype && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index eb52699..a50475b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -2968,7 +2968,8 @@ (match_operand:SWI248 1 "const_int_operand"))] "optimize_insn_for_size_p () && optimize_size > 1 && operands[1] != const0_rtx - && operands[1] != constm1_rtx + && (operands[1] != constm1_rtx + || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0]))) && IN_RANGE (INTVAL (operands[1]), -128, 127) && !ix86_red_zone_used && REGNO (operands[0]) != SP_REG" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d88c3d6..ec74f93 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -21729,6 +21729,19 @@ (const_string "orig"))) (set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")]) +;; Eliminate redundancy caused by +;; /* Special case TImode to 128-bit vector conversions via V2DI. 
*/ +;; in ix86_expand_vector_move + +(define_split + [(set (match_operand:V2DI 0 "register_operand") + (vec_concat:V2DI + (subreg:DI (match_operand:TI 1 "register_operand") 0) + (subreg:DI (match_dup 1) 8)))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + [(set (match_dup 0) + (subreg:V2DI (match_dup 1) 0))]) + (define_insn "*vec_concatv2di_0" [(set (match_operand:V2DI 0 "register_operand" "=v,v ,x") (vec_concat:V2DI diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize index fd55255..34dad45 100755 --- a/gcc/config/riscv/arch-canonicalize +++ b/gcc/config/riscv/arch-canonicalize @@ -32,7 +32,7 @@ import itertools from functools import reduce SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"] -CANONICAL_ORDER = "imafdgqlcbkjtpvn" +CANONICAL_ORDER = "imafdqlcbkjtpvnh" LONG_EXT_PREFIXES = ['z', 's', 'h', 'x'] # diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc new file mode 100644 index 0000000..9681438 --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc @@ -0,0 +1,43 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Core Name}"); + puts (""); + puts ("@opindex mcpu"); + puts ("@item -mcpu=@var{processor-string}"); + puts ("Use architecture of and optimize the output for the given processor, specified"); + puts ("by particular CPU name. Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> coreNames; + +#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \ + coreNames.push_back (CORE_NAME); +#include "riscv-cores.def" +#undef RISCV_CORE + + for (size_t i = 0; i < coreNames.size(); ++i) { + if (i == coreNames.size() - 1) { + printf("@samp{%s}.\n", coreNames[i].c_str()); + } else { + printf("@samp{%s},\n\n", coreNames[i].c_str()); + } + } + + return 0; +} diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc new file mode 100644 index 0000000..1bdfe2a --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc @@ -0,0 +1,41 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Tune Name}"); + puts (""); + puts ("@opindex mtune"); + puts ("@item -mtune=@var{processor-string}"); + puts ("Optimize the output for the given processor, specified by microarchitecture or"); + puts ("particular CPU name. 
Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> tuneNames; + +#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \ + tuneNames.push_back (TUNE_NAME); +#include "riscv-cores.def" +#undef RISCV_TUNE + + for (size_t i = 0; i < tuneNames.size(); ++i) { + printf("@samp{%s},\n\n", tuneNames[i].c_str()); + } + + puts ("and all valid options for @option{-mcpu=}."); + + return 0; +} diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1c6bc25..44ef44a 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -400,7 +400,7 @@ costs::compute_local_live_ranges ( pair &live_range = live_ranges->get_or_insert (lhs, &existed_p); gcc_assert (!existed_p); - if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info) + if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) == VMAT_LOAD_STORE_LANES) point = get_first_lane_point (program_points, program_point.stmt_info); @@ -418,8 +418,7 @@ costs::compute_local_live_ranges ( bool existed_p = false; pair &live_range = live_ranges->get_or_insert (var, &existed_p); - if (STMT_VINFO_MEMORY_ACCESS_TYPE ( - program_point.stmt_info) + if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) == VMAT_LOAD_STORE_LANES) point = get_last_lane_point (program_points, program_point.stmt_info); @@ -608,7 +607,7 @@ costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, if (type == load_vec_info_type || type == store_vec_info_type) { if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) return true; machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); @@ -1086,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const load/store. */ static int segment_loadstore_group_size (enum vect_cost_for_stmt kind, - stmt_vec_info stmt_info) + stmt_vec_info stmt_info, slp_tree node) { if (stmt_info && (kind == vector_load || kind == vector_store) @@ -1094,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind, { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); if (stmt_info - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -1108,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind, unsigned costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, stmt_vec_info stmt_info, - slp_tree, tree vectype, int stmt_cost) + slp_tree node, tree vectype, int stmt_cost) { const cpu_vector_cost *costs = get_vector_costs (); switch (kind) @@ -1131,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, each vector in the group. Here we additionally add permute costs for each. */ /* TODO: Indexed and ordered/unordered cost. 
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 1c6bc25..44ef44a 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -400,7 +400,7 @@ costs::compute_local_live_ranges (
	      pair &live_range
		= live_ranges->get_or_insert (lhs, &existed_p);
	      gcc_assert (!existed_p);
-	      if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
+	      if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
		  == VMAT_LOAD_STORE_LANES)
		point = get_first_lane_point (program_points,
					      program_point.stmt_info);
@@ -418,8 +418,7 @@ costs::compute_local_live_ranges (
		      bool existed_p = false;
		      pair &live_range
			= live_ranges->get_or_insert (var, &existed_p);
-		      if (STMT_VINFO_MEMORY_ACCESS_TYPE (
-			    program_point.stmt_info)
+		      if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
			  == VMAT_LOAD_STORE_LANES)
			point = get_last_lane_point (program_points,
						     program_point.stmt_info);
@@ -608,7 +607,7 @@ costs::need_additional_vector_vars_p (stmt_vec_info stmt_info,
   if (type == load_vec_info_type || type == store_vec_info_type)
     {
       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
-	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+	  && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
	return true;
 
       machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
@@ -1086,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
    load/store.  */
 static int
 segment_loadstore_group_size (enum vect_cost_for_stmt kind,
-			      stmt_vec_info stmt_info)
+			      stmt_vec_info stmt_info, slp_tree node)
 {
   if (stmt_info
       && (kind == vector_load || kind == vector_store)
@@ -1094,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
     {
       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       if (stmt_info
-	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+	  && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
	return DR_GROUP_SIZE (stmt_info);
     }
   return 0;
@@ -1108,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
 unsigned
 costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
			 stmt_vec_info stmt_info,
-			 slp_tree, tree vectype, int stmt_cost)
+			 slp_tree node, tree vectype, int stmt_cost)
 {
   const cpu_vector_cost *costs = get_vector_costs ();
   switch (kind)
@@ -1131,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
	       each vector in the group.  Here we additionally add permute
	       costs for each.  */
	    /* TODO: Indexed and ordered/unordered cost.  */
-	    int group_size = segment_loadstore_group_size (kind, stmt_info);
+	    int group_size = segment_loadstore_group_size (kind, stmt_info,
+							   node);
	    if (group_size > 1)
	      {
		switch (group_size)
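The shape of the riscv-vector-costs.cc change: the memory access type is now a property of the SLP node rather than of the scalar statement, so the node is threaded through to every query site. A hypothetical miniature of that refactoring (none of these types are GCC's real ones):

#include <cassert>

enum vect_memory_access_type { VMAT_CONTIGUOUS, VMAT_LOAD_STORE_LANES };

struct slp_node_t
{
  vect_memory_access_type memory_access_type; // lives on the node now
};

// Mirrors segment_loadstore_group_size: only segment (lane) accesses
// report a group size; everything else returns 0.
static int
group_size (const slp_node_t &node, int dr_group_size)
{
  return node.memory_access_type == VMAT_LOAD_STORE_LANES ? dr_group_size : 0;
}

int
main ()
{
  slp_node_t lanes = { VMAT_LOAD_STORE_LANES };
  slp_node_t contig = { VMAT_CONTIGUOUS };
  assert (group_size (lanes, 4) == 4);
  assert (group_size (contig, 4) == 0);
  return 0;
}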
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 7aac56a..a7eaa8b 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext)
	$(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi
	$(STAMP) s-riscv-ext.texi
 
-# Run `riscv-regen' after you changed or added anything from riscv-ext*.def
+RISCV_CORES_DEFS = \
+  $(srcdir)/config/riscv/riscv-cores.def
+
+build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \
+  $(RISCV_CORES_DEFS)
+	$(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \
+  $(RISCV_CORES_DEFS)
+	$(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o
+	$(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o
+	$(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true
+
+$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true
+
+s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext)
+	$(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi
+	$(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi
+	$(STAMP) s-riscv-mtune.texi
+
+s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext)
+	$(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi
+	$(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi
+	$(STAMP) s-riscv-mcpu.texi
+
+# Run `riscv-regen' after you change or add anything in riscv-ext*.def and riscv-cores*.def
 .PHONY: riscv-regen
-riscv-regen: s-riscv-ext.texi s-riscv-ext.opt
+riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 1c60695..764b499 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -10320,15 +10320,18 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot)
 
   /* case b. xx0..01..1xx: some of 15 x's (and some of 16 0's) are rotated
      over the highest bit.  */
-  int pos_one = clz_hwi ((c << 16) >> 16);
-  middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
-  int middle_ones = clz_hwi (~(c << pos_one));
-  if (middle_zeros >= 16 && middle_ones >= 33)
+  unsigned HOST_WIDE_INT uc = c;
+  int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16);
+  if (pos_one != 0)
     {
-      *rot = pos_one;
-      return true;
+      middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
+      int middle_ones = clz_hwi (~(uc << pos_one));
+      if (middle_zeros >= 16 && middle_ones >= 33)
+	{
+	  *rot = pos_one;
+	  return true;
+	}
     }
-
   return false;
 }
@@ -10445,7 +10448,8 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
   if (lz >= HOST_BITS_PER_WIDE_INT)
     return false;
 
-  int middle_ones = clz_hwi (~(c << lz));
+  unsigned HOST_WIDE_INT uc = c;
+  int middle_ones = clz_hwi (~(uc << lz));
   if (tz + lz + middle_ones >= ones
       && (tz - lz) < HOST_BITS_PER_WIDE_INT
       && tz < HOST_BITS_PER_WIDE_INT)
@@ -10479,7 +10483,7 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
   if (!IN_RANGE (pos_first_1, 1, HOST_BITS_PER_WIDE_INT-1))
     return false;
 
-  middle_ones = clz_hwi (~c << pos_first_1);
+  middle_ones = clz_hwi ((~(unsigned HOST_WIDE_INT) c) << pos_first_1);
   middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_first_1));
   if (pos_first_1 < HOST_BITS_PER_WIDE_INT
       && middle_ones + middle_zeros < HOST_BITS_PER_WIDE_INT
@@ -10581,7 +10585,8 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns)
     {
       /* li/lis; rldicX */
       unsigned HOST_WIDE_INT imm = (c | ~mask);
-      imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
+      if (shift != 0)
+	imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
 
       count_or_emit_insn (temp, GEN_INT (imm));
       if (shift != 0)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 9c718ca..e31ee40 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1969,7 +1969,7 @@
   [(set (match_dup 0) (plus:GPR (match_dup 1) (match_dup 3)))
    (set (match_dup 0) (plus:GPR (match_dup 0) (match_dup 4)))]
 {
-  HOST_WIDE_INT val = INTVAL (operands[2]);
+  unsigned HOST_WIDE_INT val = UINTVAL (operands[2]);
   HOST_WIDE_INT low = sext_hwi (val, 16);
   HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
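The rs6000 changes are undefined-behavior cleanups: left-shifting a negative HOST_WIDE_INT, rotating by zero via a shift of HOST_BITS_PER_WIDE_INT, and signed overflow are all UB in C++, so the arithmetic is moved to unsigned types and the degenerate shift is guarded. A standalone sketch of the two shift patterns, using long long as a stand-in for HOST_WIDE_INT (not GCC code):

#include <cassert>

int
main ()
{
  long long c = -42;

  // UB: left-shifting a negative signed value, e.g. c << 16.
  // Defined: shift the same bits as unsigned, then convert back.
  unsigned long long uc = (unsigned long long) c;
  long long ok = (long long) (uc << 16) >> 16; // sign-extend the low 48 bits
  assert (ok == c);

  // UB: a rotate written as (x >> s) | (x << (64 - s)) shifts by the full
  // bit width when s == 0.  Guarding the degenerate case keeps it defined.
  unsigned long long imm = 0x00000000ffff0000ULL;
  int shift = 0;
  if (shift != 0)
    imm = (imm >> shift) | (imm << (64 - shift));
  assert (imm == 0x00000000ffff0000ULL); // a rotate by 0 is the identity

  return 0;
}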
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index d760a7e..6becad1 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -128,6 +128,8 @@ extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
 extern void s390_expand_vec_init (rtx, rtx);
 extern rtx s390_expand_merge_perm_const (machine_mode, bool);
 extern void s390_expand_merge (rtx, rtx, rtx, bool);
+extern void s390_expand_int_spaceship (rtx, rtx, rtx, rtx);
+extern void s390_expand_fp_spaceship (rtx, rtx, rtx, rtx);
 extern rtx s390_build_signbit_mask (machine_mode);
 extern rtx s390_return_addr_rtx (int, rtx);
 extern rtx s390_back_chain_rtx (void);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index abe551c..012b6db 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -8213,6 +8213,167 @@ s390_expand_atomic (machine_mode mode, enum rtx_code code,
			     NULL_RTX, 1, OPTAB_DIRECT), 1);
 }
 
+/* Expand integer op0 = op1 <=> op2, i.e.,
+   op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : 1.
+
+   Signedness is specified by op3.  If op3 equals 1, then perform an unsigned
+   comparison, and if op3 equals -1, then perform a signed comparison.
+
+   For integer comparisons we strive for a sequence like
+   CR[L] ; LHI ; LOCHIL ; LOCHIH
+   where the first three instructions fit into a group.  */
+
+void
+s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+  gcc_assert (op3 == const1_rtx || op3 == constm1_rtx);
+
+  rtx cc, cond_lt, cond_gt;
+  machine_mode cc_mode;
+  machine_mode mode = GET_MODE (op1);
+
+  /* Prior to VXE3, emulate a 128-bit comparison by breaking it up into
+     three comparisons.  First test the high halves.  If they are equal,
+     then test the low halves.  Finally, test for equality.  Depending on
+     the results, make use of LOCs.  */
+  if (mode == TImode && !TARGET_VXE3)
+    {
+      gcc_assert (TARGET_VX);
+      op1
+	= force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0));
+      op2
+	= force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0));
+      rtx lab = gen_label_rtx ();
+      rtx ccz = gen_rtx_REG (CCZmode, CC_REGNUM);
+      /* Compare the high halves for equality.
+	 VEC[L]G op1, op2 sets
+	 CC1 if high(op1) < high(op2)
+	 and
+	 CC2 if high(op1) > high(op2).  */
+      machine_mode cc_mode = op3 == const1_rtx ? CCUmode : CCSmode;
+      rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+      emit_insn (gen_rtx_SET (
+	gen_rtx_REG (cc_mode, CC_REGNUM),
+	gen_rtx_COMPARE (cc_mode,
+			 gen_rtx_VEC_SELECT (DImode, op1, lane0),
+			 gen_rtx_VEC_SELECT (DImode, op2, lane0))));
+      s390_emit_jump (lab, gen_rtx_NE (CCZmode, ccz, const0_rtx));
+      /* At this point we know that the high halves are equal.
+	 VCHLGS op2, op1 sets CC1 if low(op1) < low(op2).  */
+      emit_insn (gen_rtx_PARALLEL (
+	VOIDmode,
+	gen_rtvec (2,
+		   gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM),
+				gen_rtx_COMPARE (CCVIHUmode, op2, op1)),
+		   gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode)))));
+      emit_label (lab);
+      emit_insn (gen_rtx_SET (op0, const1_rtx));
+      emit_insn (
+	gen_movsicc (op0,
+		     gen_rtx_LTU (CCUmode, gen_rtx_REG (CCUmode, CC_REGNUM),
+				  const0_rtx),
+		     constm1_rtx, op0));
+      /* Deal with the case where both halves are equal.  */
+      emit_insn (gen_rtx_PARALLEL (
+	VOIDmode,
+	gen_rtvec (2,
+		   gen_rtx_SET (gen_rtx_REG (CCVEQmode, CC_REGNUM),
+				gen_rtx_COMPARE (CCVEQmode, op1, op2)),
+		   gen_rtx_SET (gen_reg_rtx (V2DImode),
+				gen_rtx_EQ (V2DImode, op1, op2)))));
+      emit_insn (gen_movsicc (op0, gen_rtx_EQ (CCZmode, ccz, const0_rtx),
+			      const0_rtx, op0));
+      return;
+    }
+
+  if (mode == QImode || mode == HImode)
+    {
+      rtx_code extend = op3 == const1_rtx ? ZERO_EXTEND : SIGN_EXTEND;
+      op1 = simplify_gen_unary (extend, SImode, op1, mode);
+      op1 = force_reg (SImode, op1);
+      op2 = simplify_gen_unary (extend, SImode, op2, mode);
+      op2 = force_reg (SImode, op2);
+      mode = SImode;
+    }
+
+  if (op3 == const1_rtx)
+    {
+      cc_mode = CCUmode;
+      cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+      cond_lt = gen_rtx_LTU (mode, cc, const0_rtx);
+      cond_gt = gen_rtx_GTU (mode, cc, const0_rtx);
+    }
+  else
+    {
+      cc_mode = CCSmode;
+      cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+      cond_lt = gen_rtx_LT (mode, cc, const0_rtx);
+      cond_gt = gen_rtx_GT (mode, cc, const0_rtx);
+    }
+
+  emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (cc_mode, op1, op2)));
+  emit_move_insn (op0, const0_rtx);
+  emit_insn (gen_movsicc (op0, cond_lt, constm1_rtx, op0));
+  emit_insn (gen_movsicc (op0, cond_gt, const1_rtx, op0));
+}
+
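For reference, the integer expansion computes the usual three-way compare; in portable C++ the same -1/0/1 result is the classic (a > b) - (a < b) idiom (a sketch, not GCC code):

#include <cassert>
#include <cstdint>

static int
spaceship_s32 (int32_t a, int32_t b)	// signed flavor (op3 == -1)
{
  return (a > b) - (a < b);
}

static int
spaceship_u32 (uint32_t a, uint32_t b)	// unsigned flavor (op3 == 1)
{
  return (a > b) - (a < b);
}

int
main ()
{
  assert (spaceship_s32 (-1, 1) == -1);
  assert (spaceship_u32 (0xffffffffu, 1) == 1); // -1 is large when unsigned
  assert (spaceship_s32 (7, 7) == 0);
  return 0;
}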
+/* Expand floating-point op0 = op1 <=> op2, i.e.,
+   op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2.
+
+   If op3 equals const0_rtx, then we are interested in the compare only (see
+   test spaceship-fp-4.c).  Otherwise, op3 is a CONST_INT different from
+   const1_rtx and constm1_rtx, which is used to set op0 for unordered.
+
+   Emit a branch-only solution, i.e., let if-convert fold the branches into
+   LOCs if applicable.  This has the benefit that the solution is also
+   applicable if we are only interested in the compare, i.e., if op3 equals
+   const0_rtx.  */
+
+void
+s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+  gcc_assert (op3 != const1_rtx && op3 != constm1_rtx);
+
+  machine_mode mode = GET_MODE (op1);
+  machine_mode cc_mode = s390_select_ccmode (LTGT, op1, op2);
+  rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+  rtx cond_unordered = gen_rtx_UNORDERED (mode, cc_reg, const0_rtx);
+  rtx cond_eq = gen_rtx_EQ (mode, cc_reg, const0_rtx);
+  rtx cond_gt = gen_rtx_GT (mode, cc_reg, const0_rtx);
+  rtx_insn *insn;
+  rtx l_unordered = gen_label_rtx ();
+  rtx l_eq = gen_label_rtx ();
+  rtx l_gt = gen_label_rtx ();
+  rtx l_end = gen_label_rtx ();
+
+  s390_emit_compare (VOIDmode, LTGT, op1, op2);
+  if (!flag_finite_math_only)
+    {
+      insn = s390_emit_jump (l_unordered, cond_unordered);
+      add_reg_br_prob_note (insn, profile_probability::very_unlikely ());
+    }
+  insn = s390_emit_jump (l_eq, cond_eq);
+  add_reg_br_prob_note (insn, profile_probability::unlikely ());
+  insn = s390_emit_jump (l_gt, cond_gt);
+  add_reg_br_prob_note (insn, profile_probability::even ());
+  emit_move_insn (op0, constm1_rtx);
+  emit_jump (l_end);
+  emit_label (l_eq);
+  emit_move_insn (op0, const0_rtx);
+  emit_jump (l_end);
+  emit_label (l_gt);
+  emit_move_insn (op0, const1_rtx);
+  if (!flag_finite_math_only)
+    {
+      emit_jump (l_end);
+      emit_label (l_unordered);
+      rtx unord_val = op3 == const0_rtx ? const2_rtx : op3;
+      emit_move_insn (op0, unord_val);
+    }
+  emit_label (l_end);
+}
+
 /* This is called from dwarf2out.cc via TARGET_ASM_OUTPUT_DWARF_DTPREL.
    We need to emit DTP-relative relocations.  */
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1edbfde..8cc48b0 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1527,6 +1527,27 @@
   operands[0] = SET_DEST (PATTERN (curr_insn));
 })
 
+; Restrict the spaceship optab to z13 or later, since z13 introduced
+; LOAD HALFWORD IMMEDIATE ON CONDITION.
+
+(define_mode_iterator SPACESHIP_INT [(TI "TARGET_VX") DI SI HI QI])
+(define_expand "spaceship<mode>4"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:SPACESHIP_INT 1 "register_operand")
+   (match_operand:SPACESHIP_INT 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_Z13 && TARGET_64BIT"
+  "s390_expand_int_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
+(define_mode_iterator SPACESHIP_BFP [TF DF SF])
+(define_expand "spaceship<mode>4"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:SPACESHIP_BFP 1 "register_operand")
+   (match_operand:SPACESHIP_BFP 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_Z13 && TARGET_64BIT && TARGET_HARD_FLOAT"
+  "s390_expand_fp_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
 ; (TF|DF|SF|TD|DD|SD) instructions
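The floating-point flavor adds a fourth outcome for unordered operands; in portable C++ (a sketch, not GCC code, with 2 standing in for the op3 == const0_rtx case):

#include <cassert>
#include <cmath>

static int
fp_spaceship (double a, double b, int unord_val = 2)
{
  if (std::isnan (a) || std::isnan (b))
    return unord_val;	 // the l_unordered branch above
  if (a == b)
    return 0;		 // l_eq
  return a < b ? -1 : 1; // fall-through vs. l_gt
}

int
main ()
{
  assert (fp_spaceship (1.0, 2.0) == -1);
  assert (fp_spaceship (2.0, 1.0) == 1);
  assert (fp_spaceship (1.0, 1.0) == 0);
  assert (fp_spaceship (NAN, 1.0) == 2);
  return 0;
}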