aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/aarch64/aarch64-cores.def2
-rw-r--r--gcc/config/aarch64/aarch64-option-extensions.def12
-rw-r--r--gcc/config/aarch64/aarch64-protos.h1
-rw-r--r--gcc/config/aarch64/aarch64-sme.md12
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins-sme.def3
-rw-r--r--gcc/config/aarch64/aarch64-sve-builtins.cc3
-rw-r--r--gcc/config/aarch64/aarch64-sve.md276
-rw-r--r--gcc/config/aarch64/aarch64.cc93
-rw-r--r--gcc/config/aarch64/tuning_models/generic_armv9_a.h2
-rw-r--r--gcc/config/aarch64/tuning_models/olympus.h210
-rw-r--r--gcc/config/avr/avr-passes.cc139
-rw-r--r--gcc/config/avr/avr-passes.def8
-rw-r--r--gcc/config/avr/avr-protos.h1
-rw-r--r--gcc/config/avr/avr.cc26
-rw-r--r--gcc/config/avr/avr.opt4
-rw-r--r--gcc/config/avr/avr.opt.urls3
-rw-r--r--gcc/config/gcn/gcn-opts.h2
-rw-r--r--gcc/config/gcn/gcn-valu.md4
-rw-r--r--gcc/config/gcn/gcn.cc103
-rw-r--r--gcc/config/gcn/gcn.md40
-rw-r--r--gcc/config/i386/i386-features.cc2
-rw-r--r--gcc/config/i386/i386-modes.def2
-rw-r--r--gcc/config/i386/i386-options.cc35
-rw-r--r--gcc/config/i386/i386.cc96
-rw-r--r--gcc/config/i386/i386.md3
-rw-r--r--gcc/config/i386/sse.md13
-rw-r--r--gcc/config/loongarch/loongarch.h2
-rw-r--r--gcc/config/nvptx/nvptx.opt45
-rwxr-xr-xgcc/config/riscv/arch-canonicalize2
-rw-r--r--gcc/config/riscv/gen-riscv-mcpu-texi.cc43
-rw-r--r--gcc/config/riscv/gen-riscv-mtune-texi.cc41
-rw-r--r--gcc/config/riscv/riscv-v.cc2
-rw-r--r--gcc/config/riscv/riscv-vector-costs.cc28
-rw-r--r--gcc/config/riscv/riscv.cc1
-rw-r--r--gcc/config/riscv/t-riscv37
-rw-r--r--gcc/config/riscv/vector-iterators.md8
-rw-r--r--gcc/config/rs6000/rs6000.cc41
-rw-r--r--gcc/config/rs6000/rs6000.md2
-rw-r--r--gcc/config/s390/s390-protos.h2
-rw-r--r--gcc/config/s390/s390.cc161
-rw-r--r--gcc/config/s390/s390.md21
-rw-r--r--gcc/config/xtensa/xtensa.cc46
42 files changed, 1258 insertions, 319 deletions
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 8040409..6f11cc0 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -224,7 +224,7 @@ AARCH64_CORE("neoverse-v3ae", neoversev3ae, cortexa57, V9_2A, (SVE2_BITPERM, RNG
AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
/* NVIDIA ('N') cores. */
-AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), neoversev3, 0x4e, 0x10, -1)
+AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), olympus, 0x4e, 0x10, -1)
/* Armv9-A big.LITTLE processors. */
AARCH64_CORE("gb10", gb10, cortexa57, V9_2A, (SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, MEMTAG, PROFILE), cortexx925, 0x41, AARCH64_BIG_LITTLE (0xd85, 0xd87), -1)
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 1c3e697..db88df0 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -128,7 +128,9 @@ AARCH64_OPT_FMV_EXTENSION("sha2", SHA2, (SIMD), (), (), "sha1 sha2")
AARCH64_FMV_FEATURE("sha3", SHA3, (SHA3))
-AARCH64_OPT_FMV_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+AARCH64_OPT_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+
+AARCH64_FMV_FEATURE("aes", PMULL, (AES))
/* +nocrypto disables AES, SHA2 and SM4, and anything that depends on them
(such as SHA3 and the SVE2 crypto extensions). */
@@ -171,8 +173,6 @@ AARCH64_OPT_FMV_EXTENSION("i8mm", I8MM, (SIMD), (), (), "i8mm")
instructions. */
AARCH64_OPT_FMV_EXTENSION("bf16", BF16, (FP), (SIMD), (), "bf16")
-AARCH64_FMV_FEATURE("rpres", RPRES, ())
-
AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16, FCMA), (), (), "sve")
/* This specifically does not imply +sve. */
@@ -190,7 +190,7 @@ AARCH64_OPT_FMV_EXTENSION("sve2", SVE2, (SVE), (), (), "sve2")
AARCH64_OPT_EXTENSION("sve2-aes", SVE2_AES, (SVE2, AES), (), (), "sveaes")
-AARCH64_FMV_FEATURE("sve2-aes", SVE_AES, (SVE2_AES))
+AARCH64_FMV_FEATURE("sve2-aes", SVE_PMULL128, (SVE2_AES))
AARCH64_OPT_EXTENSION("sve2-bitperm", SVE2_BITPERM, (SVE2), (), (),
"svebitperm")
@@ -245,9 +245,9 @@ AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "sme
AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "smef16f16")
-AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops")
+AARCH64_OPT_FMV_EXTENSION("mops", MOPS, (), (), (), "mops")
-AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc")
+AARCH64_OPT_FMV_EXTENSION("cssc", CSSC, (), (), (), "cssc")
AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr")
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e946e8d..38c307c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
rtx aarch64_sve_packed_pred (machine_mode);
rtx aarch64_sve_fp_pred (machine_mode, rtx *);
+rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
bool aarch64_expand_maskloadstore (rtx *, machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 6b3f439..6b1a747 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -62,6 +62,10 @@
;; (b) they are sometimes used conditionally, particularly in streaming-
;; compatible code.
;;
+;; To prevent the latter from upsetting the assembler, we emit the literal
+;; encodings of "SMSTART SM" and "SMSTOP SM" when compiling without
+;; TARGET_SME.
+;;
;; =========================================================================
;; -------------------------------------------------------------------------
@@ -161,7 +165,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstart\tsm"
+ {
+ return TARGET_SME ? "smstart\tsm" : ".inst 0xd503437f // smstart sm";
+ }
)
;; Turn off streaming mode. This clobbers all SVE state.
@@ -196,7 +202,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstop\tsm"
+ {
+ return TARGET_SME ? "smstop\tsm" : ".inst 0xd503427f // smstop sm";
+ }
)
;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
index 8e6aadc..117b70e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
@@ -92,7 +92,8 @@ DEF_SME_FUNCTION (svstr_zt, str_zt, none, none)
DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none)
#undef REQUIRED_EXTENSIONS
-#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 && AARCH64_FL_FAMINMAX)
+#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \
+ | AARCH64_FL_FAMINMAX)
DEF_SME_FUNCTION_GS (svamin, binary_opt_single_n, all_float, x24, none)
DEF_SME_FUNCTION_GS (svamax, binary_opt_single_n, all_float, x24, none)
#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 2b627a9..01833a8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4004,7 +4004,8 @@ rtx
function_expander::get_reg_target ()
{
machine_mode target_mode = result_mode ();
- if (!possible_target || GET_MODE (possible_target) != target_mode)
+ if (!possible_target
+ || !register_operand (possible_target, target_mode))
possible_target = gen_reg_rtx (target_mode);
return possible_target;
}
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index b252eef..80a3288 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5605,18 +5605,21 @@
;; Predicated floating-point operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
+ (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
+ {
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
+ }
)
;; Predicated floating-point operations, merging with the first input.
@@ -5644,14 +5647,14 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5687,14 +5690,14 @@
)
(define_insn "*cond_<optab><mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5730,14 +5733,14 @@
)
(define_insn "*cond_<optab><mode>_3_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
@@ -5794,16 +5797,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -5868,16 +5871,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 4 ]
@@ -5953,14 +5956,14 @@
)
(define_insn "*cond_add<mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
(match_dup 2)]
UNSPEC_SEL))]
@@ -6015,16 +6018,16 @@
)
(define_insn_and_rewrite "*cond_add<mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ]
@@ -6266,14 +6269,14 @@
)
(define_insn "*cond_sub<mode>_3_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
(match_dup 3)]
UNSPEC_SEL))]
@@ -6323,16 +6326,16 @@
)
(define_insn_and_rewrite "*cond_sub<mode>_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
{@ [ cons: =0 , 1 , 3 , 4 ]
@@ -6913,7 +6916,7 @@
;; Predicate AND. We can reuse one of the inputs as the GP.
;; Doubling the second operand is the preferred implementation
;; of the MOV alias, so we use that instead of %1/z, %1, %2.
-(define_insn "and<mode>3"
+(define_insn "@and<mode>3"
[(set (match_operand:PRED_ALL 0 "register_operand")
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
(match_operand:PRED_ALL 2 "register_operand")))]
@@ -7595,29 +7598,29 @@
;; Unpredicated floating-point ternary operations.
(define_expand "<optab><mode>4"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 4)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 1 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_dup 5)
+ (match_operand:SVE_F_B16B16 1 "register_operand")
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
- operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]);
}
)
;; Predicated floating-point ternary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 5 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ]
@@ -7631,17 +7634,17 @@
;; Predicated floating-point ternary operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
@@ -7649,20 +7652,22 @@
second of the two. */
if (rtx_equal_p (operands[3], operands[5]))
std::swap (operands[2], operands[3]);
+
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
})
;; Predicated floating-point ternary operations, merging with the
;; first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7678,15 +7683,15 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7700,15 +7705,15 @@
;; Predicated floating-point ternary operations, merging with the
;; third input.
(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7724,15 +7729,15 @@
)
(define_insn "*cond_<optab><mode>_4_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7746,17 +7751,17 @@
;; Predicated floating-point ternary operations, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 6)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -7792,17 +7797,17 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -8201,20 +8206,23 @@
;;
;; For unpacked vectors, it doesn't really matter whether SEL uses the
;; container size or the element size. If SEL used the container size,
-;; it would ignore undefined bits of the predicate but would copy the
-;; upper (undefined) bits of each container along with the defined bits.
-;; If SEL used the element size, it would use undefined bits of the predicate
-;; to select between undefined elements in each input vector. Thus the only
-;; difference is whether the undefined bits in a container always come from
-;; the same input as the defined bits, or whether the choice can vary
-;; independently of the defined bits.
+;; it would copy the upper (undefined) bits of each container along
+;; with the corresponding defined bits. If SEL used the element size,
+;; it would use separate predicate bits to select between the undefined
+;; elements in each input vector; these separate predicate bits might
+;; themselves be undefined, depending on the mode of the predicate.
+;;
+;; Thus the only difference is whether the undefined bits in a container
+;; always come from the same input as the defined bits, or whether the
+;; choice can vary independently of the defined bits.
;;
;; For the other instructions, using the element size is more natural,
;; so we do that for SEL as well.
+;;
(define_insn "*vcond_mask_<mode><vpred>"
[(set (match_operand:SVE_ALL 0 "register_operand")
(unspec:SVE_ALL
- [(match_operand:<VPRED> 3 "register_operand")
+ [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
(match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
(match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4d9d83d..f4a2062 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -430,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
+#include "tuning_models/olympus.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"
@@ -3932,6 +3933,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
}
+/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE
+ is a partial vector mode, and if exceptions must be suppressed for its
+ undefined elements, convert PRED from a container-level predicate to
+ an element-level predicate and ensure that the undefined elements
+ are inactive. Make no changes otherwise.
+
+ Return the resultant predicate. */
+rtx
+aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+ if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+ {
+ /* Generate an element-level mask. */
+ rtx mask = aarch64_sve_packed_pred (data_mode);
+ machine_mode pmode = GET_MODE (mask);
+
+ /* Apply the existing predicate. */
+ rtx dst = gen_reg_rtx (pmode);
+ emit_insn (gen_and3 (pmode, dst, mask,
+ gen_lowpart (pmode, pred)));
+ return dst;
+ }
+
+ return pred;
+}
+
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
DATA_MODE, and return the result in a predicate of mode PRED_MODE.
Use TARGET as the target register if nonnull and convenient. */
@@ -17165,8 +17193,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
&& STMT_VINFO_DATA_REF (stmt_info))
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
- if (stmt_info
- && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
+ if (node
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -17437,8 +17465,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
for each element. We therefore need to divide the full-instruction
cost by the number of elements in the vector. */
if (kind == scalar_load
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
unsigned int nunits = vect_nunits_for_cost (vectype);
/* Test for VNx2 modes, which have 64-bit containers. */
@@ -17450,8 +17479,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
/* Detect cases in which a scalar_store is really storing one element
in a scatter operation. */
if (kind == scalar_store
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
return sve_costs->scatter_store_elt_cost;
/* Detect cases in which vec_to_scalar represents an in-loop reduction. */
@@ -17707,7 +17737,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& kind == vec_to_scalar
&& (m_vec_flags & VEC_ADVSIMD)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
auto dr = STMT_VINFO_DATA_REF (stmt_info);
tree dr_ref = DR_REF (dr);
@@ -17720,7 +17750,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
{
if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
{
- if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+ if (SLP_TREE_TYPE (node) == load_vec_info_type)
ops->loads += count - 1;
else
/* Stores want to count both the index to array and data to
@@ -17822,7 +17852,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& sve_issue
&& (kind == scalar_load || kind == scalar_store)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
unsigned int pairs = CEIL (count, 2);
ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
@@ -17977,9 +18007,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Check if we've seen an SVE gather/scatter operation and which size. */
if (kind == scalar_load
+ && node
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype))
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
if (sve_costs)
@@ -20481,6 +20512,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2)
unsigned long _size; // Size of the struct, so it can grow.
unsigned long _hwcap;
unsigned long _hwcap2;
+ unsigned long _hwcap3;
+ unsigned long _hwcap4;
}
*/
@@ -20497,14 +20530,24 @@ build_ifunc_arg_type ()
tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
get_identifier ("_hwcap2"),
long_unsigned_type_node);
+ tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap3"),
+ long_unsigned_type_node);
+ tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap4"),
+ long_unsigned_type_node);
DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field4) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field5) = ifunc_arg_type;
TYPE_FIELDS (ifunc_arg_type) = field1;
DECL_CHAIN (field1) = field2;
DECL_CHAIN (field2) = field3;
+ DECL_CHAIN (field3) = field4;
+ DECL_CHAIN (field4) = field5;
layout_type (ifunc_arg_type);
@@ -31963,9 +32006,43 @@ aarch64_test_sysreg_encoding_clashes (void)
static void
aarch64_test_sve_folding ()
{
+ aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
ssize_int (poly_int64 (1, 1)));
ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+ auto build_v16bi = [](bool a, bool b)
+ {
+ rtx_vector_builder builder (VNx16BImode, 2, 1);
+ builder.quick_push (a ? const1_rtx : const0_rtx);
+ builder.quick_push (b ? const1_rtx : const0_rtx);
+ return builder.build ();
+ };
+ rtx v16bi_10 = build_v16bi (1, 0);
+ rtx v16bi_01 = build_v16bi (0, 1);
+
+ for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+ {
+ rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+ rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+ rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+ rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+ rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+ rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+ rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+ lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+ VNx16BImode));
+ rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+ }
}
/* Run all target-specific selftests. */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index f76a250..9eb1a20 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -26,7 +26,7 @@
static const struct cpu_addrcost_table generic_armv9_a_addrcost_table =
{
{
- 1, /* hi */
+ 0, /* hi */
0, /* si */
0, /* di */
1, /* ti */
diff --git a/gcc/config/aarch64/tuning_models/olympus.h b/gcc/config/aarch64/tuning_models/olympus.h
new file mode 100644
index 0000000..268789d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/olympus.h
@@ -0,0 +1,210 @@
+/* Tuning model description for the NVIDIA Olympus core.
+ Copyright The GNU Toolchain Authors.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_OLYMPUS
+#define GCC_AARCH64_H_OLYMPUS
+
+#include "generic.h"
+
+static struct cpu_regmove_cost olympus_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 3, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static advsimd_vec_cost olympus_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static sve_vec_cost olympus_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 3, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 9, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
+ 1 /* scatter_store_elt_cost */
+};
+
+static aarch64_scalar_vec_issue_info olympus_scalar_issue_info =
+{
+ 4, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 8, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static aarch64_advsimd_vec_issue_info olympus_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static aarch64_sve_vec_issue_info olympus_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 1, /* while_pred_ops */
+ 0, /* int_cmp_pred_ops */
+ 0, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static aarch64_vec_issue_info olympus_vec_issue_info =
+{
+ &olympus_scalar_issue_info,
+ &olympus_advsimd_issue_info,
+ &olympus_sve_issue_info
+};
+
+/* Olympus costs for vector insn classes. */
+static struct cpu_vector_cost olympus_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &olympus_advsimd_vector_cost, /* advsimd */
+ &olympus_sve_vector_cost, /* sve */
+ &olympus_vec_issue_info /* issue_info */
+};
+
+/* Olympus prefetch settings (which disable prefetch). */
+static cpu_prefetch_tune olympus_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static struct tune_params olympus_tunings =
+{
+ &cortexa76_extra_costs,
+ &generic_armv9_a_addrcost_table,
+ &olympus_regmove_cost,
+ &olympus_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128, /* sve_width */
+ { 4, /* load_int. */
+ 1, /* store_int. */
+ 6, /* load_fp. */
+ 3, /* store_fp. */
+ 5, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 10, /* issue_rate */
+ AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 8, /* int_reassoc_width. */
+ 6, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 6, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_BASE
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+ | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
+ &olympus_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_OLYMPUS. */
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index 6a88a27..69df6d2 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -4843,6 +4843,137 @@ avr_pass_fuse_add::execute1 (function *func)
//////////////////////////////////////////////////////////////////////////////
+// Fuse 2 move insns after combine.
+
+static const pass_data avr_pass_data_2moves =
+{
+ RTL_PASS, // type
+ "", // name (will be patched)
+ OPTGROUP_NONE, // optinfo_flags
+ TV_DF_SCAN, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ 0 // todo_flags_finish
+};
+
+class avr_pass_2moves : public rtl_opt_pass
+{
+public:
+ avr_pass_2moves (gcc::context *ctxt, const char *name)
+ : rtl_opt_pass (avr_pass_data_2moves, ctxt)
+ {
+ this->name = name;
+ }
+
+ unsigned int execute (function *func) final override
+ {
+ if (optimize && avropt_fuse_move2)
+ {
+ bool changed = false;
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, func)
+ {
+ changed |= optimize_2moves_bb (bb);
+ }
+
+ if (changed)
+ {
+ df_note_add_problem ();
+ df_analyze ();
+ }
+ }
+
+ return 0;
+ }
+
+ bool optimize_2moves (rtx_insn *, rtx_insn *);
+ bool optimize_2moves_bb (basic_block);
+}; // avr_pass_2moves
+
+bool
+avr_pass_2moves::optimize_2moves_bb (basic_block bb)
+{
+ bool changed = false;
+ rtx_insn *insn1 = nullptr;
+ rtx_insn *insn2 = nullptr;
+ rtx_insn *curr;
+
+ FOR_BB_INSNS (bb, curr)
+ {
+ if (insn1 && INSN_P (insn1)
+ && insn2 && INSN_P (insn2))
+ changed |= optimize_2moves (insn1, insn2);
+
+ insn1 = insn2;
+ insn2 = curr;
+ }
+
+ return changed;
+}
+
+bool
+avr_pass_2moves::optimize_2moves (rtx_insn *insn1, rtx_insn *insn2)
+{
+ bool good = false;
+ bool bad = false;
+ rtx set1, dest1, src1;
+ rtx set2, dest2, src2;
+
+ if ((set1 = single_set (insn1))
+ && (set2 = single_set (insn2))
+ && (src1 = SET_SRC (set1))
+ && REG_P (src2 = SET_SRC (set2))
+ && REG_P (dest1 = SET_DEST (set1))
+ && REG_P (dest2 = SET_DEST (set2))
+ && rtx_equal_p (dest1, src2)
+ // Now we have:
+ // insn1: dest1 = src1
+ // insn2: dest2 = dest1
+ && REGNO (dest1) >= FIRST_PSEUDO_REGISTER
+ // Paranoia.
+ && GET_CODE (PATTERN (insn1)) != PARALLEL
+ && GET_CODE (PATTERN (insn2)) != PARALLEL
+ && (rtx_equal_p (dest2, src1)
+ || !reg_overlap_mentioned_p (dest2, src1)))
+ {
+ avr_dump ("\n;; Found 2moves:\n%r\n%r\n", insn1, insn2);
+ avr_dump (";; reg %d: insn uses uids:", REGNO (dest1));
+
+ // Go check that dest1 is used exactly once, namely by insn2.
+
+ df_ref use = DF_REG_USE_CHAIN (REGNO (dest1));
+ for (; use; use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *user = DF_REF_INSN (use);
+ avr_dump (" %d", INSN_UID (user));
+ good |= INSN_UID (user) == INSN_UID (insn2);
+ bad |= INSN_UID (user) != INSN_UID (insn2);
+ }
+ avr_dump (".\n");
+
+ if (good && !bad
+ // Propagate src1 to insn2:
+ // insn1: # Deleted
+ // insn2: dest2 = src1
+ && validate_change (insn2, &SET_SRC (set2), src1, false))
+ {
+ SET_INSN_DELETED (insn1);
+ return true;
+ }
+ }
+
+ if (good && !bad)
+ avr_dump (";; Failed\n");
+
+ return false;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////////
// Split insns with nonzero_bits() after combine.
static const pass_data avr_pass_data_split_nzb =
@@ -5704,6 +5835,14 @@ make_avr_pass_casesi (gcc::context *ctxt)
return new avr_pass_casesi (ctxt, "avr-casesi");
}
+// Optimize 2 consecutive moves after combine.
+
+rtl_opt_pass *
+make_avr_pass_2moves (gcc::context *ctxt)
+{
+ return new avr_pass_2moves (ctxt, "avr-2moves");
+}
+
rtl_opt_pass *
make_avr_pass_split_nzb (gcc::context *ctxt)
{
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def
index eb60a93..d668c7f 100644
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -74,6 +74,14 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes);
INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
+/* Insn combine may come up with superfluous reg-reg moves, where the combine
+ people say that these are no problem since reg-alloc is supposed to optimize
+ them. The issue is that the lower-subreg pass sitting between combine and
+ reg-alloc may split such moves, coming up with a zoo of subregs which are
+ only handled poorly by the register allocator. */
+
+INSERT_PASS_AFTER (pass_combine, 1, avr_pass_2moves);
+
/* Some combine insns have nonzero_bits() in their condition, though insns
should not use such stuff in their condition. Therefore, we split such
insn into something without nonzero_bits() in their condition right after
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index ca30136..37911e7 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -208,6 +208,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_nzb (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *);
+extern rtl_opt_pass *make_avr_pass_2moves (gcc::context *);
#ifdef RTX_CODE
extern bool avr_casei_sequence_check_operands (rtx *xop);
extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index c469297..1fb59b6 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -14418,6 +14418,13 @@ avr_output_addr_vec (rtx_insn *labl, rtx table)
// Output the label that precedes the table.
ASM_OUTPUT_ALIGN (stream, 1);
+
+ char s_labl[40];
+ targetm.asm_out.generate_internal_label (s_labl, "L",
+ CODE_LABEL_NUMBER (labl));
+ ASM_OUTPUT_TYPE_DIRECTIVE (stream, s_labl,
+ AVR_HAVE_JMP_CALL ? "object" : "function");
+
targetm.asm_out.internal_label (stream, "L", CODE_LABEL_NUMBER (labl));
// Output the table's content.
@@ -14984,10 +14991,11 @@ avr_addr_space_convert (rtx src, tree type_old, tree type_new)
/* Linearize memory: RAM has bit 23 set. When as_new = __flashx then
this is basically UB since __flashx mistreats RAM addresses, but there
- is no way to bail out. (Though -Waddr-space-convert will tell.) */
+ is no way to bail out. (Though -Waddr-space-convert will tell.)
+ ...but PR121277 is confusing, in particular when NULL is coming in. */
int msb = ADDR_SPACE_GENERIC_P (as_old)
- ? 0x80
+ ? as_new == ADDR_SPACE_MEMX ? 0x80 : 0x00
: avr_addrspace[as_old].segment;
src = force_reg (Pmode, src);
@@ -15085,10 +15093,16 @@ avr_convert_to_type (tree type, tree expr)
const char *name_old = avr_addrspace[as_old].name;
const char *name_new = avr_addrspace[as_new].name;
- warning (OPT_Waddr_space_convert,
- "conversion from address space %qs to address space %qs",
- ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
- ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
+ // Be relaxed when NULL is used, and when 0x0 stands for
+ // address 0x0.
+ bool nowarn = (expr == null_pointer_node
+ && (as_new == ADDR_SPACE_FLASHX
+ || as_new == ADDR_SPACE_FLASH));
+ if (!nowarn)
+ warning (OPT_Waddr_space_convert,
+ "conversion from address space %qs to address space %qs",
+ ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
+ ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
return fold_build1_loc (loc, ADDR_SPACE_CONVERT_EXPR, type, expr);
}
diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt
index 9883119..7f6f18c 100644
--- a/gcc/config/avr/avr.opt
+++ b/gcc/config/avr/avr.opt
@@ -164,6 +164,10 @@ mfuse-move=
Target Joined RejectNegative UInteger Var(avropt_fuse_move) Init(0) Optimization IntegerRange(0, 23)
-mfuse-move=<0,23> Optimization. Run a post-reload pass that tweaks move instructions.
+mfuse-move2
+Target Var(avropt_fuse_move2) Init(0) Optimization
+Optimization. Fuse some move insns after insn combine.
+
mabsdata
Target Mask(ABSDATA)
Assume that all data in static storage can be accessed by LDS / STS instructions. This option is only useful for reduced Tiny devices like ATtiny40.
diff --git a/gcc/config/avr/avr.opt.urls b/gcc/config/avr/avr.opt.urls
index 662fdee..87c26b2 100644
--- a/gcc/config/avr/avr.opt.urls
+++ b/gcc/config/avr/avr.opt.urls
@@ -92,6 +92,9 @@ UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
mfuse-move=
UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
+mfuse-move2
+UrlSuffix(gcc/AVR-Options.html#index-mfuse-move2)
+
mabsdata
UrlSuffix(gcc/AVR-Options.html#index-mabsdata)
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index fe68678..0287400 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -92,6 +92,8 @@ enum hsaco_attr_type
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
+ Note: on atomics, glc/sc0 denotes whether the pre-op value should
+ be returned.
CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
there is no non-scalar user so far. */
#define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 0994329..a34d2e3 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3938,6 +3938,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3992,6 +3993,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -4050,6 +4052,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -4073,6 +4076,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 8959118..5ffeb23 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
#include "gimple.h"
#include "cgraph.h"
#include "case-cfn-macros.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+ /* TODO: This seems to produce tighter loops, but the testsuite expects it
+ to be set to '2', so I'll leave it default for now.
+ SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+ param_vect_partial_vector_usage, 1); */
}
/* }}} */
@@ -5789,45 +5795,19 @@ gcn_libc_has_function (enum function_class fn_class,
return bsd_libc_has_function (fn_class, type);
}
-/* }}} */
-/* {{{ md_reorg pass. */
-
-/* Identify V_CMPX from the "type" attribute;
- note: this will also match 'v_cmp %E1 vcc'. */
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
static bool
-gcn_cmpx_insn_p (attr_type type)
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+ int ARG_UNUSED (scale),
+ unsigned int ARG_UNUSED (group_size))
{
- switch (type)
- {
- case TYPE_VOPC:
- return true;
- case TYPE_MUBUF:
- case TYPE_MTBUF:
- case TYPE_FLAT:
- case TYPE_VOP3P_MAI:
- case TYPE_UNKNOWN:
- case TYPE_SOP1:
- case TYPE_SOP2:
- case TYPE_SOPK:
- case TYPE_SOPC:
- case TYPE_SOPP:
- case TYPE_SMEM:
- case TYPE_DS:
- case TYPE_VOP2:
- case TYPE_VOP1:
- case TYPE_VOP3A:
- case TYPE_VOP3B:
- case TYPE_VOP_SDWA:
- case TYPE_VOP_DPP:
- case TYPE_MULT:
- case TYPE_VMULT:
- return false;
- }
- gcc_unreachable ();
- return false;
+ return true;
}
+/* }}} */
+/* {{{ md_reorg pass. */
+
/* Identify VMEM instructions from their "type" attribute. */
static bool
@@ -6356,19 +6336,59 @@ gcn_md_reorg (void)
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* NOTE: The following condition for adding wait state exists, but
+ GCC does not access the special registers using their SGPR#.
+ Thus, no action is required here. The following wait-state
+ condition exists at least for VEGA/gfx900+ to CDNA3:
+ Mixed use of VCC: alias vs. SGPR# - v_readlane,
+ v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+ followed by VALU reads VCC as constant requires 1 wait state.
+ (As carry-in, it requires none.)
+ [VCC can be accessed by name or logical SGPR that holds it.] */
+
+ /* Testing indicates that CDNA3 requires an s_nop between
+ e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+ Thus: add it between v_cmp writing VCC and VALU read of VCC. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && iunit == UNIT_VECTOR
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 wait state
+ - VALU reads SGPR as carry-in requires no wait state
+ - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && iunit == UNIT_VECTOR
+ && prev_insn->unit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_SRC_REGS]))
+ {
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
+ }
+
/* CDNA3: v_cmpx followed by
- V_readlane, v_readfirstlane, v_writelane requires 4 wait states
- VALU reads EXEC as constant requires 2 wait states
- other VALU requires no wait state */
if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 4
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& get_attr_laneselect (insn) != LANESELECT_NO)
nops_rqd = 4 - prev_insn->age;
else if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 2
&& iunit == UNIT_VECTOR
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
nops_rqd = 2 - prev_insn->age;
@@ -6436,8 +6456,8 @@ gcn_md_reorg (void)
}
/* Insert the required number of NOPs. */
- for (int i = nops_rqd; i > 0; i--)
- emit_insn_after (gen_nop (), last_insn);
+ if (nops_rqd > 0)
+ emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn);
/* Age the previous instructions. We can also ignore writes to
registers subsequently overwritten. */
@@ -7283,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem)
H - print second part of a multi-reg value (high-part of 2-reg value)
J - print third part of a multi-reg value
K - print fourth part of a multi-reg value
+ R - Print a scalar register number as an integer. Temporary hack.
+ V - Print a vector register number as an integer. Temporary hack.
+
+ Additionally, the standard builtin c, n, a, and l exist; see gccint's
+ "Output Templates and Operand Substitution" for details.
*/
void
@@ -8131,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
gcn_vectorize_builtin_vectorized_function
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index fad42e6..4130cf6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -324,6 +324,11 @@
"store,storex34,load,atomic,atomicwait,cmpswapx2,no"
(const_string "no"))
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
+
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
@@ -424,6 +429,15 @@
"s_nop\t0x0"
[(set_attr "type" "sopp")])
+; Variant of 'nop' that accepts a count argument.
+; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however,
+; as %0 prints decimals, only 0 to 9 (= 1 to 10 nops) can be used.
+(define_insn "nops"
+ [(match_operand 0 "const_int_operand")]
+ ""
+ "s_nop\t0x%0"
+ [(set_attr "type" "sopp")])
+
; FIXME: What should the value of the immediate be? Zero is disallowed, so
; pick 1 for now.
(define_insn "trap"
@@ -566,6 +580,7 @@
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
(set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+ (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
(set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
@@ -1089,6 +1104,7 @@
s_cmp%D1\t%2, %3
v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "sopc,vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_insn "cstoredi4_vector"
@@ -1099,6 +1115,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranchdi4"
@@ -1125,6 +1142,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranch<mode>4"
@@ -2165,7 +2183,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2177,7 +2195,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2224,7 +2242,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
: "error: cache architectire unspecified");
case 2:
return (TARGET_GLn_CACHE
@@ -2232,7 +2250,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
: "error: cache architecture unspecified");
}
break;
@@ -2252,7 +2270,8 @@
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+ "flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2263,7 +2282,8 @@
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+ "global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
@@ -2347,7 +2367,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: "error: cache architecture unspecified");
case 2:
@@ -2360,7 +2380,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: "error: cache architecture unspecified");
@@ -2382,7 +2402,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2395,7 +2415,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..53e86c8 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3226,7 +3226,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 2fedbeb..c2db305 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
VECTOR_MODE (FLOAT, BF, 2); /* V2BF */
VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
@@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
-VECTOR_MODE (INT, SI, 64); /* V64SI */
INT_MODE (OI, 32);
INT_MODE (XI, 64);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ca6bb83..09a35ef 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3615,6 +3615,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
+ if (TARGET_64BIT)
+ {
+ /* Do not warn when emulating the MS ABI. */
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
+ || ix86_function_type_abi (*node) != MS_ABI)
+ warning (OPT_Wattributes, "%qE attribute ignored",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("regparm", name))
{
@@ -3627,7 +3639,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
{
- error ("regparam and thiscall attributes are not compatible");
+ error ("regparm and thiscall attributes are not compatible");
}
cst = TREE_VALUE (args);
@@ -3648,19 +3660,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
- if (TARGET_64BIT)
- {
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
+ /* Can combine fastcall with sseregparm. */
if (is_attribute_p ("fastcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3681,8 +3681,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
}
}
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
+ /* Can combine stdcall with regparm and sseregparm. */
else if (is_attribute_p ("stdcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3732,6 +3731,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
{
error ("cdecl and thiscall attributes are not compatible");
}
+ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparm and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4682db85..65e04d3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12442,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol;
static rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+ function with no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
if (!ix86_tls_symbol)
{
const char *sym
@@ -20007,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -21491,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -23132,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -25211,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25283,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25308,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25336,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25789,7 +25815,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -26128,32 +26154,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
+ == VMAT_GATHER_SCATTER)))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26217,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb52699..a50475b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2968,7 +2968,8 @@
(match_operand:SWI248 1 "const_int_operand"))]
"optimize_insn_for_size_p () && optimize_size > 1
&& operands[1] != const0_rtx
- && operands[1] != constm1_rtx
+ && (operands[1] != constm1_rtx
+ || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0])))
&& IN_RANGE (INTVAL (operands[1]), -128, 127)
&& !ix86_red_zone_used
&& REGNO (operands[0]) != SP_REG"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d88c3d6..ec74f93 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -21729,6 +21729,19 @@
(const_string "orig")))
(set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+;; Eliminate redundancy caused by
+;; /* Special case TImode to 128-bit vector conversions via V2DI. */
+;; in ix86_expand_vector_move
+
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_concat:V2DI
+ (subreg:DI (match_operand:TI 1 "register_operand") 0)
+ (subreg:DI (match_dup 1) 8)))]
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
+ [(set (match_dup 0)
+ (subreg:V2DI (match_dup 1) 0))])
+
(define_insn "*vec_concatv2di_0"
[(set (match_operand:V2DI 0 "register_operand" "=v,v ,x")
(vec_concat:V2DI
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index d897763..5fc8665 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -823,8 +823,6 @@ typedef struct {
#define CASE_VECTOR_MODE Pmode
-#define CASE_VECTOR_SHORTEN_MODE(MIN, MAX, BODY) Pmode
-
/* Define this as 1 if `char' should by default be signed; else as 0. */
#ifndef DEFAULT_SIGNED_CHAR
#define DEFAULT_SIGNED_CHAR 1
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index d326ca4..9796839 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -120,6 +120,51 @@ Target RejectNegative Alias(misa=,sm_89)
march-map=sm_90a
Target RejectNegative Alias(misa=,sm_89)
+march-map=sm_100
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121a
+Target RejectNegative Alias(misa=,sm_89)
+
Enum
Name(ptx_version) Type(enum ptx_version)
Known PTX ISA versions (for use with the -mptx= option):
diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize
index fd55255..34dad45 100755
--- a/gcc/config/riscv/arch-canonicalize
+++ b/gcc/config/riscv/arch-canonicalize
@@ -32,7 +32,7 @@ import itertools
from functools import reduce
SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"]
-CANONICAL_ORDER = "imafdgqlcbkjtpvn"
+CANONICAL_ORDER = "imafdqlcbkjtpvnh"
LONG_EXT_PREFIXES = ['z', 's', 'h', 'x']
#
diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
new file mode 100644
index 0000000..9681438
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
@@ -0,0 +1,43 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Core Name}");
+ puts ("");
+ puts ("@opindex mcpu");
+ puts ("@item -mcpu=@var{processor-string}");
+ puts ("Use architecture of and optimize the output for the given processor, specified");
+ puts ("by particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> coreNames;
+
+#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \
+ coreNames.push_back (CORE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_CORE
+
+ for (size_t i = 0; i < coreNames.size(); ++i) {
+ if (i == coreNames.size() - 1) {
+ printf("@samp{%s}.\n", coreNames[i].c_str());
+ } else {
+ printf("@samp{%s},\n\n", coreNames[i].c_str());
+ }
+ }
+
+ return 0;
+}
diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc
new file mode 100644
index 0000000..1bdfe2a
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc
@@ -0,0 +1,41 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Tune Name}");
+ puts ("");
+ puts ("@opindex mtune");
+ puts ("@item -mtune=@var{processor-string}");
+ puts ("Optimize the output for the given processor, specified by microarchitecture or");
+ puts ("particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> tuneNames;
+
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
+ tuneNames.push_back (TUNE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_TUNE
+
+ for (size_t i = 0; i < tuneNames.size(); ++i) {
+ printf("@samp{%s},\n\n", tuneNames[i].c_str());
+ }
+
+ puts ("and all valid options for @option{-mcpu=}.");
+
+ return 0;
+}
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 54eb8c6..c9c8328 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -5693,6 +5693,7 @@ expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec,
switch (unspec)
{
+ case UNSPEC_VAADD:
case UNSPEC_VAADDU:
icode = code_for_pred_scalar (unspec, mode);
break;
@@ -5717,6 +5718,7 @@ expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec,
switch (unspec)
{
+ case UNSPEC_VAADD:
case UNSPEC_VAADDU:
icode = code_for_pred_scalar (unspec, mode);
break;
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index df924fa..44ef44a 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -275,13 +275,13 @@ loop_invariant_op_p (class loop *loop,
/* Return true if the variable should be counted into liveness. */
static bool
variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info,
- slp_tree node ATTRIBUTE_UNUSED, tree var, bool lhs_p)
+ slp_tree node, tree var, bool lhs_p)
{
if (!var)
return false;
gimple *stmt = STMT_VINFO_STMT (stmt_info);
stmt_info = vect_stmt_to_vectorize (stmt_info);
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
{
if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE
@@ -400,7 +400,7 @@ costs::compute_local_live_ranges (
pair &live_range
= live_ranges->get_or_insert (lhs, &existed_p);
gcc_assert (!existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_first_lane_point (program_points,
program_point.stmt_info);
@@ -418,8 +418,7 @@ costs::compute_local_live_ranges (
bool existed_p = false;
pair &live_range
= live_ranges->get_or_insert (var, &existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (
- program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_last_lane_point (program_points,
program_point.stmt_info);
@@ -602,13 +601,13 @@ get_store_value (gimple *stmt)
/* Return true if additional vector vars needed. */
bool
costs::need_additional_vector_vars_p (stmt_vec_info stmt_info,
- slp_tree node ATTRIBUTE_UNUSED)
+ slp_tree node)
{
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (type == load_vec_info_type || type == store_vec_info_type)
{
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
return true;
machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
@@ -694,7 +693,7 @@ costs::update_local_live_ranges (
if (!node)
continue;
- if (STMT_VINFO_TYPE (stmt_info) == undef_vec_info_type)
+ if (SLP_TREE_TYPE (*node) == undef_vec_info_type)
continue;
for (j = 0; j < gimple_phi_num_args (phi); j++)
@@ -773,7 +772,7 @@ costs::update_local_live_ranges (
slp_tree *node = vinfo_slp_map.get (stmt_info);
if (!node)
continue;
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (*node);
if (need_additional_vector_vars_p (stmt_info, *node))
{
/* For non-adjacent load/store STMT, we will potentially
@@ -1086,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
load/store. */
static int
segment_loadstore_group_size (enum vect_cost_for_stmt kind,
- stmt_vec_info stmt_info)
+ stmt_vec_info stmt_info, slp_tree node)
{
if (stmt_info
&& (kind == vector_load || kind == vector_store)
@@ -1094,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
if (stmt_info
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -1108,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
unsigned
costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
stmt_vec_info stmt_info,
- slp_tree, tree vectype, int stmt_cost)
+ slp_tree node, tree vectype, int stmt_cost)
{
const cpu_vector_cost *costs = get_vector_costs ();
switch (kind)
@@ -1131,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
each vector in the group. Here we additionally add permute
costs for each. */
/* TODO: Indexed and ordered/unordered cost. */
- int group_size = segment_loadstore_group_size (kind, stmt_info);
+ int group_size = segment_loadstore_group_size (kind, stmt_info,
+ node);
if (group_size > 1)
{
switch (group_size)
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index b4f2d13..0a9fcef 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4040,6 +4040,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
switch (XINT (op, 1))
{
case UNSPEC_VAADDU:
+ case UNSPEC_VAADD:
*total
= get_vector_binary_rtx_cost (op, scalar2vr_cost);
break;
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 7aac56a..a7eaa8b 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext)
$(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi
$(STAMP) s-riscv-ext.texi
-# Run `riscv-regen' after you changed or added anything from riscv-ext*.def
+RISCV_CORES_DEFS = \
+ $(srcdir)/config/riscv/riscv-cores.def
+
+build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true
+
+$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true
+
+s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi
+ $(STAMP) s-riscv-mtune.texi
+
+s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi
+ $(STAMP) s-riscv-mcpu.texi
+
+# Run `riscv-regen' after you have changed or added anything in riscv-ext*.def or riscv-cores*.def
.PHONY: riscv-regen
-riscv-regen: s-riscv-ext.texi s-riscv-ext.opt
+riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index dbb48a4..aa3b6fb 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4014,11 +4014,11 @@
UNSPEC_VSSRL UNSPEC_VSSRA])
(define_int_iterator VSAT_VX_OP_V_VDUP [
- UNSPEC_VAADDU
+ UNSPEC_VAADDU UNSPEC_VAADD
])
(define_int_iterator VSAT_VX_OP_VDUP_V [
- UNSPEC_VAADDU
+ UNSPEC_VAADDU UNSPEC_VAADD
])
(define_int_iterator VSAT_ARITH_OP [UNSPEC_VAADDU UNSPEC_VAADD
@@ -4056,11 +4056,11 @@
(UNSPEC_VNCLIPU "vnclip")])
(define_int_attr sat_op_v_vdup [
- (UNSPEC_VAADDU "aaddu")
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
])
(define_int_attr sat_op_vdup_v [
- (UNSPEC_VAADDU "aaddu")
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
])
(define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") (UNSPEC_VMSOF "sof")
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 16227e5..764b499 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -5174,6 +5174,7 @@ public:
protected:
void update_target_cost_per_stmt (vect_cost_for_stmt, stmt_vec_info,
+ slp_tree node,
vect_cost_model_location, unsigned int);
void density_test (loop_vec_info);
void adjust_vect_cost_per_loop (loop_vec_info);
@@ -5321,6 +5322,7 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind,
void
rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
stmt_vec_info stmt_info,
+ slp_tree node,
vect_cost_model_location where,
unsigned int orig_count)
{
@@ -5381,12 +5383,12 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
or may not need to apply. When finalizing the cost of the loop,
the extra penalty is applied when the load density heuristics
are satisfied. */
- if (kind == vec_construct && stmt_info
- && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP))
+ if (kind == vec_construct && node
+ && SLP_TREE_TYPE (node) == load_vec_info_type
+ && (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP))
{
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree vectype = SLP_TREE_VECTYPE (node);
unsigned int nunits = vect_nunits_for_cost (vectype);
/* As PR103702 shows, it's possible that vectorizer wants to do
costings for only one unit here, it's no need to do any
@@ -5415,7 +5417,7 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
unsigned
rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
- stmt_vec_info stmt_info, slp_tree,
+ stmt_vec_info stmt_info, slp_tree node,
tree vectype, int misalign,
vect_cost_model_location where)
{
@@ -5433,7 +5435,7 @@ rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
m_costs[where] += retval;
- update_target_cost_per_stmt (kind, stmt_info, where, orig_count);
+ update_target_cost_per_stmt (kind, stmt_info, node, where, orig_count);
}
return retval;
@@ -10318,15 +10320,18 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot)
/* case b. xx0..01..1xx: some of 15 x's (and some of 16 0's) are
rotated over the highest bit. */
- int pos_one = clz_hwi ((c << 16) >> 16);
- middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
- int middle_ones = clz_hwi (~(c << pos_one));
- if (middle_zeros >= 16 && middle_ones >= 33)
+ unsigned HOST_WIDE_INT uc = c;
+ int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16);
+ if (pos_one != 0)
{
- *rot = pos_one;
- return true;
+ middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
+ int middle_ones = clz_hwi (~(uc << pos_one));
+ if (middle_zeros >= 16 && middle_ones >= 33)
+ {
+ *rot = pos_one;
+ return true;
+ }
}
-
return false;
}
@@ -10443,7 +10448,8 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (lz >= HOST_BITS_PER_WIDE_INT)
return false;
- int middle_ones = clz_hwi (~(c << lz));
+ unsigned HOST_WIDE_INT uc = c;
+ int middle_ones = clz_hwi (~(uc << lz));
if (tz + lz + middle_ones >= ones
&& (tz - lz) < HOST_BITS_PER_WIDE_INT
&& tz < HOST_BITS_PER_WIDE_INT)
@@ -10477,7 +10483,7 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (!IN_RANGE (pos_first_1, 1, HOST_BITS_PER_WIDE_INT-1))
return false;
- middle_ones = clz_hwi (~c << pos_first_1);
+ middle_ones = clz_hwi ((~(unsigned HOST_WIDE_INT) c) << pos_first_1);
middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_first_1));
if (pos_first_1 < HOST_BITS_PER_WIDE_INT
&& middle_ones + middle_zeros < HOST_BITS_PER_WIDE_INT
@@ -10579,7 +10585,8 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns)
{
/* li/lis; rldicX */
unsigned HOST_WIDE_INT imm = (c | ~mask);
- imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
+ if (shift != 0)
+ imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
count_or_emit_insn (temp, GEN_INT (imm));
if (shift != 0)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 9c718ca..e31ee40 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1969,7 +1969,7 @@
[(set (match_dup 0) (plus:GPR (match_dup 1) (match_dup 3)))
(set (match_dup 0) (plus:GPR (match_dup 0) (match_dup 4)))]
{
- HOST_WIDE_INT val = INTVAL (operands[2]);
+ unsigned HOST_WIDE_INT val = UINTVAL (operands[2]);
HOST_WIDE_INT low = sext_hwi (val, 16);
HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index d760a7e..6becad1 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -128,6 +128,8 @@ extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
extern void s390_expand_vec_init (rtx, rtx);
extern rtx s390_expand_merge_perm_const (machine_mode, bool);
extern void s390_expand_merge (rtx, rtx, rtx, bool);
+extern void s390_expand_int_spaceship (rtx, rtx, rtx, rtx);
+extern void s390_expand_fp_spaceship (rtx, rtx, rtx, rtx);
extern rtx s390_build_signbit_mask (machine_mode);
extern rtx s390_return_addr_rtx (int, rtx);
extern rtx s390_back_chain_rtx (void);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index abe551c..012b6db 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -8213,6 +8213,167 @@ s390_expand_atomic (machine_mode mode, enum rtx_code code,
NULL_RTX, 1, OPTAB_DIRECT), 1);
}
+/* Expand integer op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : 1.
+
+ Signedness is specified by op3. If op3 equals 1, then perform an unsigned
+ comparison, and if op3 equals -1, then perform a signed comparison.
+
+ For integer comparisons we strive for a sequence like
+ CR[L] ; LHI ; LOCHIL ; LOCHIH
+ where the first three instructions fit into a group. */
+
+void
+s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 == const1_rtx || op3 == constm1_rtx);
+
+ rtx cc, cond_lt, cond_gt;
+ machine_mode cc_mode;
+ machine_mode mode = GET_MODE (op1);
+
+ /* Prior to VXE3, emulate a 128-bit comparison by breaking it up into three
+ comparisons. First test the high halves. If they are equal, then test
+ the low halves. Finally, test for equality. Depending on the results
+ make use of LOCs. */
+ if (mode == TImode && !TARGET_VXE3)
+ {
+ gcc_assert (TARGET_VX);
+ op1
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0));
+ op2
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0));
+ rtx lab = gen_label_rtx ();
+ rtx ccz = gen_rtx_REG (CCZmode, CC_REGNUM);
+ /* Compare high halves for equality.
+ VEC[L]G op1, op2 sets
+ CC1 if high(op1) < high(op2)
+ and
+ CC2 if high(op1) > high(op2). */
+ machine_mode cc_mode = op3 == const1_rtx ? CCUmode : CCSmode;
+ rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+ emit_insn (gen_rtx_SET (
+ gen_rtx_REG (cc_mode, CC_REGNUM),
+ gen_rtx_COMPARE (cc_mode,
+ gen_rtx_VEC_SELECT (DImode, op1, lane0),
+ gen_rtx_VEC_SELECT (DImode, op2, lane0))));
+ s390_emit_jump (lab, gen_rtx_NE (CCZmode, ccz, const0_rtx));
+ /* At this point we know that the high halves are equal.
+ VCHLGS op2, op1 sets CC1 if low(op1) < low(op2) */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVIHUmode, op2, op1)),
+ gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode)))));
+ emit_label (lab);
+ emit_insn (gen_rtx_SET (op0, const1_rtx));
+ emit_insn (
+ gen_movsicc (op0,
+ gen_rtx_LTU (CCUmode, gen_rtx_REG (CCUmode, CC_REGNUM),
+ const0_rtx),
+ constm1_rtx, op0));
+ /* Deal with the case where both halves are equal. */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVEQmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVEQmode, op1, op2)),
+ gen_rtx_SET (gen_reg_rtx (V2DImode),
+ gen_rtx_EQ (V2DImode, op1, op2)))));
+ emit_insn (gen_movsicc (op0, gen_rtx_EQ (CCZmode, ccz, const0_rtx),
+ const0_rtx, op0));
+ return;
+ }
+
+ if (mode == QImode || mode == HImode)
+ {
+ rtx_code extend = op3 == const1_rtx ? ZERO_EXTEND : SIGN_EXTEND;
+ op1 = simplify_gen_unary (extend, SImode, op1, mode);
+ op1 = force_reg (SImode, op1);
+ op2 = simplify_gen_unary (extend, SImode, op2, mode);
+ op2 = force_reg (SImode, op2);
+ mode = SImode;
+ }
+
+ if (op3 == const1_rtx)
+ {
+ cc_mode = CCUmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LTU (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GTU (mode, cc, const0_rtx);
+ }
+ else
+ {
+ cc_mode = CCSmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LT (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GT (mode, cc, const0_rtx);
+ }
+
+ emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (cc_mode, op1, op2)));
+ emit_move_insn (op0, const0_rtx);
+ emit_insn (gen_movsicc (op0, cond_lt, constm1_rtx, op0));
+ emit_insn (gen_movsicc (op0, cond_gt, const1_rtx, op0));
+}
+
+/* Expand floating-point op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2.
+
+ If op3 equals const0_rtx, then we are interested in the compare only (see
+ test spaceship-fp-4.c). Otherwise, op3 is a CONST_INT different from
+ const1_rtx and constm1_rtx, and is the value assigned to op0 for unordered.
+
+ Emit a branch-only solution, i.e., let if-convert fold the branches into
+ LOCs if applicable. This has the benefit that the solution is also
+ applicable if we are only interested in the compare, i.e., if op3 equals
+ const0_rtx.
+ */
+
+void
+s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 != const1_rtx && op3 != constm1_rtx);
+
+ machine_mode mode = GET_MODE (op1);
+ machine_mode cc_mode = s390_select_ccmode (LTGT, op1, op2);
+ rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ rtx cond_unordered = gen_rtx_UNORDERED (mode, cc_reg, const0_rtx);
+ rtx cond_eq = gen_rtx_EQ (mode, cc_reg, const0_rtx);
+ rtx cond_gt = gen_rtx_GT (mode, cc_reg, const0_rtx);
+ rtx_insn *insn;
+ rtx l_unordered = gen_label_rtx ();
+ rtx l_eq = gen_label_rtx ();
+ rtx l_gt = gen_label_rtx ();
+ rtx l_end = gen_label_rtx ();
+
+ s390_emit_compare (VOIDmode, LTGT, op1, op2);
+ if (!flag_finite_math_only)
+ {
+ insn = s390_emit_jump (l_unordered, cond_unordered);
+ add_reg_br_prob_note (insn, profile_probability::very_unlikely ());
+ }
+ insn = s390_emit_jump (l_eq, cond_eq);
+ add_reg_br_prob_note (insn, profile_probability::unlikely ());
+ insn = s390_emit_jump (l_gt, cond_gt);
+ add_reg_br_prob_note (insn, profile_probability::even ());
+ emit_move_insn (op0, constm1_rtx);
+ emit_jump (l_end);
+ emit_label (l_eq);
+ emit_move_insn (op0, const0_rtx);
+ emit_jump (l_end);
+ emit_label (l_gt);
+ emit_move_insn (op0, const1_rtx);
+ if (!flag_finite_math_only)
+ {
+ emit_jump (l_end);
+ emit_label (l_unordered);
+ rtx unord_val = op3 == const0_rtx ? const2_rtx : op3;
+ emit_move_insn (op0, unord_val);
+ }
+ emit_label (l_end);
+}
+
/* This is called from dwarf2out.cc via TARGET_ASM_OUTPUT_DWARF_DTPREL.
We need to emit DTP-relative relocations. */
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1edbfde..8cc48b0 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1527,6 +1527,27 @@
operands[0] = SET_DEST (PATTERN (curr_insn));
})
+; Restrict spaceship optab to z13 or later since there we have
+; LOAD HALFWORD IMMEDIATE ON CONDITION.
+
+(define_mode_iterator SPACESHIP_INT [(TI "TARGET_VX") DI SI HI QI]) ; TI is iterated only when TARGET_VX holds
+(define_expand "spaceship<mode>4" ; instantiated once per SPACESHIP_INT mode
+ [(match_operand:SI 0 "register_operand") ; NOTE(review): presumably the SImode result; confirm in s390_expand_int_spaceship
+ (match_operand:SPACESHIP_INT 1 "register_operand")
+ (match_operand:SPACESHIP_INT 2 "register_operand")
+ (match_operand:SI 3 "const_int_operand")] ; constant forwarded unchanged to the expander
+ "TARGET_Z13 && TARGET_64BIT"
+ "s390_expand_int_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;") ; all RTL is emitted by the helper, which is defined outside this view
+
+(define_mode_iterator SPACESHIP_BFP [TF DF SF])
+(define_expand "spaceship<mode>4" ; instantiated once per SPACESHIP_BFP mode
+ [(match_operand:SI 0 "register_operand") ; result register written by s390_expand_fp_spaceship
+ (match_operand:SPACESHIP_BFP 1 "register_operand") ; first value compared
+ (match_operand:SPACESHIP_BFP 2 "register_operand") ; second value compared
+ (match_operand:SI 3 "const_int_operand")] ; selects the value stored for the unordered case
+ "TARGET_Z13 && TARGET_64BIT && TARGET_HARD_FLOAT" ; same condition as the integer expander plus hard float
+ "s390_expand_fp_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;") ; all RTL is emitted by the helper
+
; (TF|DF|SF|TD|DD|SD) instructions
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 02554c5..d75cba4 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -4702,25 +4702,49 @@ static bool
xtensa_is_insn_L32R_p (const rtx_insn *insn)
{
rtx pat, dest, src;
+ machine_mode mode;
- /* "PATTERN (insn)" can be used without checking, see insn_cost()
- in gcc/rtlanal.cc. */
+ /* RTX insns that are not "(set (reg) ...)" cannot become L32R instructions:
+ - it is permitted to apply PATTERN() to the insn without validation.
+ See insn_cost() in gcc/rtlanal.cc.
+ - register_operand() is used instead of REG() to identify things that
+ don't look like REGs but will eventually become so as well. */
if (GET_CODE (pat = PATTERN (insn)) != SET
|| ! register_operand (dest = SET_DEST (pat), VOIDmode))
return false;
+ /* If the source is a reference to a literal pool entry, then the insn
+ obviously corresponds to an L32R instruction. */
if (constantpool_mem_p (src = SET_SRC (pat)))
return true;
- /* Return true if:
- - CONST16 instruction is not configured, and
- - the source is some constant, and also
- - negation of "the source is integer and fits into the immediate
- field". */
- return (!TARGET_CONST16
- && CONSTANT_P (src)
- && ! ((GET_MODE (dest) == SImode || GET_MODE (dest) == HImode)
- && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src))));
+ /* Similarly, an insn whose source is not a constant obviously does not
+ correspond to L32R. */
+ if (! CONSTANT_P (src))
+ return false;
+
+ /* If the source is a CONST_INT whose value fits into signed 12 bits, then
+ the insn corresponds to a MOVI instruction (rather than an L32R one),
+ regardless of the configuration of TARGET_CONST16 or
+ TARGET_AUTOLITPOOLS. Note that the destination register can be non-
+ SImode. */
+ if (((mode = GET_MODE (dest)) == SImode
+ || mode == HImode || mode == SFmode)
+ && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src)))
+ return false;
+
+ /* If TARGET_CONST16 is configured, constants of the remaining forms
+ correspond to pairs of CONST16 instructions, not L32R. */
+ if (TARGET_CONST16)
+ return false;
+
+ /* The last remaining form of constant is one of the following:
+ - CONST_INTs with large values
+ - floating-point constants
+ - symbolic constants
+ all of which are handled by a relaxed MOVI instruction, which is later
+ converted to an L32R instruction by the assembler. */
+ return true;
}
/* Compute a relative costs of RTL insns. This is necessary in order to