Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-cores.def | 2
-rw-r--r--  gcc/config/aarch64/aarch64-cost-tables.h | 54
-rw-r--r--  gcc/config/aarch64/aarch64-option-extensions.def | 12
-rw-r--r--  gcc/config/aarch64/aarch64-protos.h | 1
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md | 5
-rw-r--r--  gcc/config/aarch64/aarch64-sme.md | 12
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-sme.def | 3
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins.cc | 3
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md | 671
-rw-r--r--  gcc/config/aarch64/aarch64-sve2.md | 10
-rw-r--r--  gcc/config/aarch64/aarch64.cc | 144
-rw-r--r--  gcc/config/aarch64/cortex-a57-fma-steering.cc | 5
-rw-r--r--  gcc/config/aarch64/iterators.md | 23
-rw-r--r--  gcc/config/aarch64/tuning_models/generic_armv9_a.h | 2
-rw-r--r--  gcc/config/aarch64/tuning_models/olympus.h | 210
-rw-r--r--  gcc/config/arm/aarch-cost-tables.h | 36
-rw-r--r--  gcc/config/arm/arm.cc | 12
-rw-r--r--  gcc/config/avr/avr-passes.cc | 145
-rw-r--r--  gcc/config/avr/avr-passes.def | 8
-rw-r--r--  gcc/config/avr/avr-protos.h | 1
-rw-r--r--  gcc/config/avr/avr.cc | 26
-rw-r--r--  gcc/config/avr/avr.opt | 4
-rw-r--r--  gcc/config/avr/avr.opt.urls | 3
-rw-r--r--  gcc/config/cris/cris.cc | 6
-rw-r--r--  gcc/config/epiphany/epiphany.cc | 8
-rw-r--r--  gcc/config/gcn/gcn-opts.h | 7
-rw-r--r--  gcc/config/gcn/gcn-valu.md | 293
-rw-r--r--  gcc/config/gcn/gcn.cc | 311
-rw-r--r--  gcc/config/gcn/gcn.md | 289
-rw-r--r--  gcc/config/i386/i386-features.cc | 2
-rw-r--r--  gcc/config/i386/i386-modes.def | 2
-rw-r--r--  gcc/config/i386/i386-options.cc | 39
-rw-r--r--  gcc/config/i386/i386.cc | 102
-rw-r--r--  gcc/config/i386/i386.md | 3
-rw-r--r--  gcc/config/i386/sse.md | 13
-rw-r--r--  gcc/config/loongarch/loongarch.cc | 8
-rw-r--r--  gcc/config/loongarch/loongarch.h | 2
-rw-r--r--  gcc/config/nvptx/nvptx.opt | 45
-rw-r--r--  gcc/config/pru/pru-pragma.cc | 13
-rw-r--r--  gcc/config/pru/pru-protos.h | 8
-rw-r--r--  gcc/config/pru/pru.cc | 8
-rwxr-xr-x  gcc/config/riscv/arch-canonicalize | 2
-rw-r--r--  gcc/config/riscv/autovec-opt.md | 122
-rw-r--r--  gcc/config/riscv/autovec.md | 26
-rw-r--r--  gcc/config/riscv/gen-riscv-mcpu-texi.cc | 43
-rw-r--r--  gcc/config/riscv/gen-riscv-mtune-texi.cc | 41
-rw-r--r--  gcc/config/riscv/generic-vector-ooo.md | 85
-rw-r--r--  gcc/config/riscv/mips-p8700.md | 2
-rw-r--r--  gcc/config/riscv/predicates.md | 13
-rw-r--r--  gcc/config/riscv/riscv-ext.def | 30
-rw-r--r--  gcc/config/riscv/riscv-protos.h | 16
-rw-r--r--  gcc/config/riscv/riscv-string.cc | 6
-rw-r--r--  gcc/config/riscv/riscv-v.cc | 302
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins-bases.cc | 3
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins.cc | 41
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins.h | 1
-rw-r--r--  gcc/config/riscv/riscv-vector-costs.cc | 87
-rw-r--r--  gcc/config/riscv/riscv-vector-costs.h | 16
-rw-r--r--  gcc/config/riscv/riscv.cc | 100
-rw-r--r--  gcc/config/riscv/riscv.md | 10
-rw-r--r--  gcc/config/riscv/t-riscv | 37
-rw-r--r--  gcc/config/riscv/vector-iterators.md | 16
-rw-r--r--  gcc/config/riscv/vector.md | 410
-rw-r--r--  gcc/config/riscv/xiangshan.md | 3
-rw-r--r--  gcc/config/rs6000/rs6000.cc | 52
-rw-r--r--  gcc/config/rs6000/rs6000.md | 2
-rw-r--r--  gcc/config/s390/s390-protos.h | 2
-rw-r--r--  gcc/config/s390/s390.cc | 188
-rw-r--r--  gcc/config/s390/s390.md | 21
-rw-r--r--  gcc/config/xtensa/xtensa.cc | 68
-rw-r--r--  gcc/config/xtensa/xtensa.md | 16
71 files changed, 3241 insertions, 1071 deletions
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 8040409..6f11cc0 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -224,7 +224,7 @@ AARCH64_CORE("neoverse-v3ae", neoversev3ae, cortexa57, V9_2A, (SVE2_BITPERM, RNG
AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
/* NVIDIA ('N') cores. */
-AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), neoversev3, 0x4e, 0x10, -1)
+AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), olympus, 0x4e, 0x10, -1)
/* Armv9-A big.LITTLE processors. */
AARCH64_CORE("gb10", gb10, cortexa57, V9_2A, (SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, MEMTAG, PROFILE), cortexx925, 0x41, AARCH64_BIG_LITTLE (0xd85, 0xd87), -1)
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index c49ff7f..e7926eb 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -125,9 +125,9 @@ const struct cpu_cost_table qdf24xx_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -233,9 +233,9 @@ const struct cpu_cost_table thunderx_extra_costs =
{
COSTS_N_INSNS (1), /* Alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -340,9 +340,9 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
{
COSTS_N_INSNS (1), /* Alu. */
COSTS_N_INSNS (4), /* Mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -447,9 +447,9 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
{
COSTS_N_INSNS (1), /* Alu. */
COSTS_N_INSNS (4), /* Mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -555,9 +555,9 @@ const struct cpu_cost_table tsv110_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -662,9 +662,9 @@ const struct cpu_cost_table a64fx_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -769,9 +769,9 @@ const struct cpu_cost_table ampere1_extra_costs =
{
COSTS_N_INSNS (3), /* alu. */
COSTS_N_INSNS (3), /* mult. */
- COSTS_N_INSNS (2), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -876,9 +876,9 @@ const struct cpu_cost_table ampere1a_extra_costs =
{
COSTS_N_INSNS (3), /* alu. */
COSTS_N_INSNS (3), /* mult. */
- COSTS_N_INSNS (2), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -983,9 +983,9 @@ const struct cpu_cost_table ampere1b_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (2), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (1), /* dup. */
- COSTS_N_INSNS (1) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (0), /* dup. */
+ COSTS_N_INSNS (0) /* extract. */
}
};
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 1c3e697..db88df0 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -128,7 +128,9 @@ AARCH64_OPT_FMV_EXTENSION("sha2", SHA2, (SIMD), (), (), "sha1 sha2")
AARCH64_FMV_FEATURE("sha3", SHA3, (SHA3))
-AARCH64_OPT_FMV_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+AARCH64_OPT_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+
+AARCH64_FMV_FEATURE("aes", PMULL, (AES))
/* +nocrypto disables AES, SHA2 and SM4, and anything that depends on them
(such as SHA3 and the SVE2 crypto extensions). */
@@ -171,8 +173,6 @@ AARCH64_OPT_FMV_EXTENSION("i8mm", I8MM, (SIMD), (), (), "i8mm")
instructions. */
AARCH64_OPT_FMV_EXTENSION("bf16", BF16, (FP), (SIMD), (), "bf16")
-AARCH64_FMV_FEATURE("rpres", RPRES, ())
-
AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16, FCMA), (), (), "sve")
/* This specifically does not imply +sve. */
@@ -190,7 +190,7 @@ AARCH64_OPT_FMV_EXTENSION("sve2", SVE2, (SVE), (), (), "sve2")
AARCH64_OPT_EXTENSION("sve2-aes", SVE2_AES, (SVE2, AES), (), (), "sveaes")
-AARCH64_FMV_FEATURE("sve2-aes", SVE_AES, (SVE2_AES))
+AARCH64_FMV_FEATURE("sve2-aes", SVE_PMULL128, (SVE2_AES))
AARCH64_OPT_EXTENSION("sve2-bitperm", SVE2_BITPERM, (SVE2), (), (),
"svebitperm")
@@ -245,9 +245,9 @@ AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "sme
AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "smef16f16")
-AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops")
+AARCH64_OPT_FMV_EXTENSION("mops", MOPS, (), (), (), "mops")
-AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc")
+AARCH64_OPT_FMV_EXTENSION("cssc", CSSC, (), (), (), "cssc")
AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr")
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e946e8d..38c307c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
rtx aarch64_sve_packed_pred (machine_mode);
rtx aarch64_sve_fp_pred (machine_mode, rtx *);
+rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
bool aarch64_expand_maskloadstore (rtx *, machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 270cb2f..8b75c3d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1190,13 +1190,16 @@
[(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
)
+;; Inserting from the zero register into a vector lane is treated as an
+;; expensive GP->FP move on all CPUs. Avoid it when optimizing for speed.
(define_insn "aarch64_simd_vec_set_zero<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_merge:VALL_F16
(match_operand:VALL_F16 1 "register_operand" "0")
(match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
(match_operand:SI 2 "immediate_operand" "i")))]
- "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0"
+ "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0
+ && optimize_function_for_size_p (cfun)"
{
int elt = ENDIAN_LANE_N (<nunits>,
aarch64_exact_log2_inverse (<nunits>,
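
Not part of the patch: a minimal sketch of source that reaches this pattern (the function name is hypothetical). Zeroing a single lane maps to aarch64_simd_vec_set_zero, i.e. an INS from WZR; with the optimize_function_for_size_p test added above, that GP->FP form is only used when optimizing for size, and speed-optimized code is left to the other vec_set patterns.

#include <arm_neon.h>

/* At -Os this can still emit "ins v0.s[1], wzr"; at -O2 the insn no longer
   matches, so the zero is expected to come from the SIMD side instead.  */
float32x4_t
clear_lane1 (float32x4_t v)
{
  return vsetq_lane_f32 (0.0f, v, 1);
}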
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 6b3f439..6b1a747 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -62,6 +62,10 @@
;; (b) they are sometimes used conditionally, particularly in streaming-
;; compatible code.
;;
+;; To prevent the latter from upsetting the assembler, we emit the literal
+;; encodings of "SMSTART SM" and "SMSTOP SM" when compiling without
+;; TARGET_SME.
+;;
;; =========================================================================
;; -------------------------------------------------------------------------
@@ -161,7 +165,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstart\tsm"
+ {
+ return TARGET_SME ? "smstart\tsm" : ".inst 0xd503437f // smstart sm";
+ }
)
;; Turn off streaming mode. This clobbers all SVE state.
@@ -196,7 +202,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstop\tsm"
+ {
+ return TARGET_SME ? "smstop\tsm" : ".inst 0xd503427f // smstop sm";
+ }
)
;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
index 8e6aadc..117b70e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
@@ -92,7 +92,8 @@ DEF_SME_FUNCTION (svstr_zt, str_zt, none, none)
DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none)
#undef REQUIRED_EXTENSIONS
-#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 && AARCH64_FL_FAMINMAX)
+#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \
+ | AARCH64_FL_FAMINMAX)
DEF_SME_FUNCTION_GS (svamin, binary_opt_single_n, all_float, x24, none)
DEF_SME_FUNCTION_GS (svamax, binary_opt_single_n, all_float, x24, none)
#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 2b627a9..01833a8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4004,7 +4004,8 @@ rtx
function_expander::get_reg_target ()
{
machine_mode target_mode = result_mode ();
- if (!possible_target || GET_MODE (possible_target) != target_mode)
+ if (!possible_target
+ || !register_operand (possible_target, target_mode))
possible_target = gen_reg_rtx (target_mode);
return possible_target;
}
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 10aecf1..80a3288 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3752,9 +3752,9 @@
;; Unpredicated floating-point unary operations.
(define_insn "@aarch64_sve_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")]
+ [(set (match_operand:SVE_F 0 "register_operand" "=w")
+ (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")]
SVE_FP_UNARY))]
"TARGET_SVE"
"<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>"
@@ -3762,25 +3762,41 @@
;; Unpredicated floating-point unary operations.
(define_expand "<optab><mode>2"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_dup 2)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "register_operand")]
+ (match_dup 3)
+ (match_operand:SVE_F 1 "register_operand")]
SVE_COND_FP_UNARY_OPTAB))]
"TARGET_SVE"
{
+ operands[2] = aarch64_sve_fp_pred (<MODE>mode, &operands[3]);
+ }
+)
+
+;; FABS and FNEG are non-trapping, so we can always expand with a <VPRED>
+;; predicate. It doesn't matter whether the padding bits of a partial
+;; vector mode are active or inactive.
+(define_expand "<optab><mode>2"
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_dup 2)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_F 1 "register_operand")]
+ SVE_COND_FP_UNARY_BITWISE))]
+ "TARGET_SVE"
+ {
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated floating-point unary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 3 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")]
SVE_COND_FP_UNARY))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
@@ -3806,13 +3822,13 @@
;; Predicated floating-point unary arithmetic, merging with the first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 3)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")]
SVE_COND_FP_UNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -3854,15 +3870,15 @@
;; as earlyclobber helps to make the instruction more regular to the
;; register allocator.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")]
SVE_COND_FP_UNARY)
- (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[3])"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
@@ -5495,27 +5511,25 @@
;; Split a predicated instruction whose predicate is unused into an
;; unpredicated instruction.
(define_split
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
<SVE_COND_FP>))]
- "TARGET_SVE
- && reload_completed
- && INTVAL (operands[4]) == SVE_RELAXED_GP"
+ "TARGET_SVE && reload_completed"
[(set (match_dup 0)
- (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16 (match_dup 2) (match_dup 3)))]
+ (SVE_UNPRED_FP_BINARY:SVE_F_B16B16 (match_dup 2) (match_dup 3)))]
)
;; Unpredicated floating-point binary operations (post-RA only).
;; These are generated by the split above.
(define_insn "*post_ra_<sve_fp_op><mode>3"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand" "=w")
- (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16
- (match_operand:SVE_FULL_F_B16B16 1 "register_operand" "w")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand" "w")))]
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand" "=w")
+ (SVE_UNPRED_FP_BINARY:SVE_F_B16B16
+ (match_operand:SVE_F_B16B16 1 "register_operand" "w")
+ (match_operand:SVE_F_B16B16 2 "register_operand" "w")))]
"TARGET_SVE && reload_completed"
"<b><sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>")
@@ -5547,10 +5561,10 @@
;; Unpredicated floating-point binary operations.
(define_insn "@aarch64_sve_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")
- (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+ [(set (match_operand:SVE_F 0 "register_operand" "=w")
+ (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")
+ (match_operand:SVE_F 2 "register_operand" "w")]
SVE_FP_BINARY))]
"TARGET_SVE"
"<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
@@ -5559,27 +5573,27 @@
;; Unpredicated floating-point binary operations that need to be predicated
;; for SVE.
(define_expand "<optab><mode>3"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 3)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 1 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs2_operand>")]
+ (match_dup 4)
+ (match_operand:SVE_F_B16B16 1 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY_OPTAB))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
- operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[3] = aarch64_sve_fp_pred (<MODE>mode, &operands[4]);
}
)
;; Predicated floating-point binary operations that have no immediate forms.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")]
SVE_COND_FP_BINARY_REG))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
@@ -5591,30 +5605,33 @@
;; Predicated floating-point operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
+ (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
+ {
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
+ }
)
;; Predicated floating-point operations, merging with the first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5630,14 +5647,14 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5650,14 +5667,14 @@
;; Same for operations that take a 1-bit constant.
(define_insn_and_rewrite "*cond_<optab><mode>_2_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5673,14 +5690,14 @@
)
(define_insn "*cond_<optab><mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5693,14 +5710,14 @@
;; Predicated floating-point operations, merging with the second input.
(define_insn_and_rewrite "*cond_<optab><mode>_3_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
@@ -5716,14 +5733,14 @@
)
(define_insn "*cond_<optab><mode>_3_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
@@ -5736,16 +5753,16 @@
;; Predicated floating-point operations, merging with an independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -5780,16 +5797,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -5818,16 +5835,16 @@
;; Same for operations that take a 1-bit constant.
(define_insn_and_rewrite "*cond_<optab><mode>_any_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 4 ]
@@ -5854,16 +5871,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 4 ]
@@ -5892,12 +5909,12 @@
;; Predicated floating-point addition.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand")]
SVE_COND_FP_ADD))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx ]
@@ -5914,14 +5931,14 @@
;; Predicated floating-point addition of a constant, merging with the
;; first input.
(define_insn_and_rewrite "*cond_add<mode>_2_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5939,14 +5956,14 @@
)
(define_insn "*cond_add<mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5962,16 +5979,16 @@
;; Predicated floating-point addition of a constant, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_add<mode>_any_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ]
@@ -6001,16 +6018,16 @@
)
(define_insn_and_rewrite "*cond_add<mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ]
@@ -6208,12 +6225,12 @@
;; Predicated floating-point subtraction.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_operand")
+ (match_operand:SVE_F 3 "register_operand")]
SVE_COND_FP_SUB))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx ]
@@ -6229,14 +6246,14 @@
;; Predicated floating-point subtraction from a constant, merging with the
;; second input.
(define_insn_and_rewrite "*cond_sub<mode>_3_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
(match_dup 3)]
UNSPEC_SEL))]
@@ -6252,14 +6269,14 @@
)
(define_insn "*cond_sub<mode>_3_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
(match_dup 3)]
UNSPEC_SEL))]
@@ -6273,16 +6290,16 @@
;; Predicated floating-point subtraction from a constant, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_sub<mode>_const_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
{@ [ cons: =0 , 1 , 3 , 4 ]
@@ -6309,16 +6326,16 @@
)
(define_insn_and_rewrite "*cond_sub<mode>_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
{@ [ cons: =0 , 1 , 3 , 4 ]
@@ -6631,12 +6648,12 @@
;; Predicated floating-point multiplication.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_mul_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_mul_operand")]
SVE_COND_FP_MUL))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx ]
@@ -6671,12 +6688,12 @@
;; -------------------------------------------------------------------------
(define_expand "div<mode>3"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_dup 3)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "nonmemory_operand")
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_dup 4)
+ (match_operand:SVE_F 1 "nonmemory_operand")
+ (match_operand:SVE_F 2 "register_operand")]
UNSPEC_COND_FDIV))]
"TARGET_SVE"
{
@@ -6684,23 +6701,23 @@
DONE;
operands[1] = force_reg (<MODE>mode, operands[1]);
- operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[3] = aarch64_sve_fp_pred (<MODE>mode, &operands[4]);
}
)
(define_expand "@aarch64_frecpe<mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")]
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand")]
UNSPEC_FRECPE))]
"TARGET_SVE"
)
(define_expand "@aarch64_frecps<mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand")
+ (match_operand:SVE_F 2 "register_operand")]
UNSPEC_FRECPS))]
"TARGET_SVE"
)
@@ -6865,12 +6882,12 @@
;; Predicated floating-point maximum/minimum.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_maxmin_operand")]
SVE_COND_FP_MAXMIN))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , %2 , 3 ; attrs: movprfx ]
@@ -6899,7 +6916,7 @@
;; Predicate AND. We can reuse one of the inputs as the GP.
;; Doubling the second operand is the preferred implementation
;; of the MOV alias, so we use that instead of %1/z, %1, %2.
-(define_insn "and<mode>3"
+(define_insn "@and<mode>3"
[(set (match_operand:PRED_ALL 0 "register_operand")
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
(match_operand:PRED_ALL 2 "register_operand")))]
@@ -7581,29 +7598,29 @@
;; Unpredicated floating-point ternary operations.
(define_expand "<optab><mode>4"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 4)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 1 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_dup 5)
+ (match_operand:SVE_F_B16B16 1 "register_operand")
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
- operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]);
}
)
;; Predicated floating-point ternary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 5 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ]
@@ -7617,17 +7634,17 @@
;; Predicated floating-point ternary operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
@@ -7635,20 +7652,22 @@
second of the two. */
if (rtx_equal_p (operands[3], operands[5]))
std::swap (operands[2], operands[3]);
+
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
})
;; Predicated floating-point ternary operations, merging with the
;; first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7664,15 +7683,15 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7686,15 +7705,15 @@
;; Predicated floating-point ternary operations, merging with the
;; third input.
(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7710,15 +7729,15 @@
)
(define_insn "*cond_<optab><mode>_4_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7732,17 +7751,17 @@
;; Predicated floating-point ternary operations, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 6)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -7778,17 +7797,17 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -8187,20 +8206,23 @@
;;
;; For unpacked vectors, it doesn't really matter whether SEL uses
;; the container size or the element size. If SEL used the container size,
-;; it would ignore undefined bits of the predicate but would copy the
-;; upper (undefined) bits of each container along with the defined bits.
-;; If SEL used the element size, it would use undefined bits of the predicate
-;; to select between undefined elements in each input vector. Thus the only
-;; difference is whether the undefined bits in a container always come from
-;; the same input as the defined bits, or whether the choice can vary
-;; independently of the defined bits.
+;; it would copy the upper (undefined) bits of each container along
+;; with the corresponding defined bits. If SEL used the element size,
+;; it would use separate predicate bits to select between the undefined
+;; elements in each input vector; these seperate predicate bits might
+;; themselves be undefined, depending on the mode of the predicate.
+;;
+;; Thus the only difference is whether the undefined bits in a container
+;; always come from the same input as the defined bits, or whether the
+;; choice can vary independently of the defined bits.
;;
;; For the other instructions, using the element size is more natural,
;; so we do that for SEL as well.
+;;
(define_insn "*vcond_mask_<mode><vpred>"
[(set (match_operand:SVE_ALL 0 "register_operand")
(unspec:SVE_ALL
- [(match_operand:<VPRED> 3 "register_operand")
+ [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
(match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
(match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
@@ -9653,6 +9675,31 @@
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>_relaxed"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_PARTIAL_F 2 "register_operand")]
+ SVE_COND_FCVTI)
+ (match_operand:SVE_HSDI 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
(define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
(unspec:SVE_FULL_HSDI
@@ -9706,6 +9753,29 @@
}
)
+(define_insn_and_rewrite "*cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx2SI_ONLY:mode>_relaxed"
+ [(set (match_operand:VNx2SI_ONLY 0 "register_operand")
+ (unspec:VNx2SI_ONLY
+ [(match_operand:VNx2BI 1 "register_operand")
+ (unspec:VNx2SI_ONLY
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:VNx2DF_ONLY 2 "register_operand")]
+ SVE_COND_FCVTI)
+ (match_operand:VNx2SI_ONLY 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<VNx2DF_ONLY:Vetype>, %1/z, %2.<VNx2DF_ONLY:Vetype>\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT<-FP] Packs
;; -------------------------------------------------------------------------
@@ -9857,6 +9927,31 @@
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>_relaxed"
+ [(set (match_operand:SVE_PARTIAL_F 0 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_HSDI 2 "register_operand")]
+ SVE_COND_ICVTF)
+ (match_operand:SVE_PARTIAL_F 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
(define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict"
[(set (match_operand:SVE_FULL_F 0 "register_operand")
(unspec:SVE_FULL_F
@@ -10066,6 +10161,30 @@
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand:<SVE_SDF:VPRED> 1 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_SDF 2 "register_operand")]
+ SVE_COND_FCVT)
+ (match_operand:SVE_PARTIAL_HSF 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [FP<-FP] Packs (bfloat16)
;; -------------------------------------------------------------------------
@@ -10259,6 +10378,30 @@
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>_relaxed"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand:<SVE_SDF:VPRED> 1 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_PARTIAL_HSF 2 "register_operand")]
+ SVE_COND_FCVT)
+ (match_operand:SVE_SDF 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [PRED<-PRED] Packs
;; -------------------------------------------------------------------------
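
Not part of the patch: the SVE_FULL_F -> SVE_F and SVE_FULL_F_B16B16 -> SVE_F_B16B16 substitutions above extend these patterns from fully-packed to partial (unpacked) vector modes. A rough sketch of code that can give rise to such modes, assuming SVE auto-vectorization (the function name is hypothetical): mixing float and double in one loop typically makes the vectorizer pair VNx2DF with unpacked VNx2SF, so the float operations can use these predicated SVE patterns directly.

/* With -O3 on an SVE target, the float multiply and the double->float
   conversion may be carried out on unpacked VNx2SF vectors rather than
   falling back to Advanced SIMD.  */
void
scale (float *__restrict x, const double *__restrict y, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = x[i] * (float) y[i];
}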
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 8c03e28..31bdd85 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1346,12 +1346,12 @@
;; Predicated B16B16 binary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:VNx8BF_ONLY 0 "register_operand")
- (unspec:VNx8BF_ONLY
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_BF 0 "register_operand")
+ (unspec:SVE_BF
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:VNx8BF_ONLY 2 "register_operand")
- (match_operand:VNx8BF_ONLY 3 "register_operand")]
+ (match_operand:SVE_BF 2 "register_operand")
+ (match_operand:SVE_BF 3 "register_operand")]
SVE_COND_FP_BINARY_OPTAB))]
"TARGET_SSVE_B16B16 && <supports_bf16>"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx , is_rev ]
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0485f69..f4a2062 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -356,7 +356,8 @@ static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
- bool is_packed);
+ bool is_packed,
+ bool is_gather_scatter);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
@@ -429,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
+#include "tuning_models/olympus.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"
@@ -3931,6 +3933,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
}
+/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE
+ is a partial vector mode, and if exceptions must be suppressed for its
+ undefined elements, convert PRED from a container-level predicate to
+ an element-level predicate and ensure that the undefined elements
+ are inactive. Make no changes otherwise.
+
+ Return the resultant predicate. */
+rtx
+aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+ if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+ {
+ /* Generate an element-level mask. */
+ rtx mask = aarch64_sve_packed_pred (data_mode);
+ machine_mode pmode = GET_MODE (mask);
+
+ /* Apply the existing predicate. */
+ rtx dst = gen_reg_rtx (pmode);
+ emit_insn (gen_and3 (pmode, dst, mask,
+ gen_lowpart (pmode, pred)));
+ return dst;
+ }
+
+ return pred;
+}
+
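A hypothetical C-level illustration (not part of the patch) of why the element-level masking matters: when a loop is vectorized with a partial mode such as VNx2SF, the unused container lanes hold undefined values, and under -ftrapping-math an FP operation predicated only at container granularity could raise spurious exceptions on those lanes.

  void
  cond_scale (float *x, const long long *sel, int n)
  {
    for (int i = 0; i < n; ++i)   /* float data lives in a partial vector */
      if (sel[i])
        x[i] = x[i] * 2.0f;       /* must not trap on the undefined lanes */
  }
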
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
DATA_MODE, and return the result in a predicate of mode PRED_MODE.
Use TARGET as the target register if nonnull and convenient. */
@@ -15854,11 +15883,14 @@ cost_plus:
break;
case CONST_VECTOR:
{
- /* Load using MOVI/MVNI. */
- if (aarch64_simd_valid_mov_imm (x))
- *cost = extra_cost->vect.movi;
- else /* Load using constant pool. */
- *cost = extra_cost->ldst.load;
+ if (speed)
+ {
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_mov_imm (x))
+ *cost += extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost += extra_cost->ldst.load;
+ }
break;
}
case VEC_CONCAT:
@@ -15867,7 +15899,8 @@ cost_plus:
break;
case VEC_DUPLICATE:
/* Load using a DUP. */
- *cost = extra_cost->vect.dup;
+ if (speed)
+ *cost += extra_cost->vect.dup;
return false;
case VEC_SELECT:
{
@@ -15875,13 +15908,16 @@ cost_plus:
*cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
/* cost subreg of 0 as free, otherwise as DUP */
- rtx op1 = XEXP (x, 1);
- if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
- ;
- else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
- *cost = extra_cost->vect.dup;
- else
- *cost = extra_cost->vect.extract;
+ if (speed)
+ {
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost += extra_cost->vect.dup;
+ else
+ *cost += extra_cost->vect.extract;
+ }
return true;
}
default:
@@ -17157,8 +17193,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
&& STMT_VINFO_DATA_REF (stmt_info))
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
- if (stmt_info
- && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
+ if (node
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -17429,8 +17465,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
for each element. We therefore need to divide the full-instruction
cost by the number of elements in the vector. */
if (kind == scalar_load
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
unsigned int nunits = vect_nunits_for_cost (vectype);
/* Test for VNx2 modes, which have 64-bit containers. */
@@ -17442,8 +17479,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
/* Detect cases in which a scalar_store is really storing one element
in a scatter operation. */
if (kind == scalar_store
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
return sve_costs->scatter_store_elt_cost;
/* Detect cases in which vec_to_scalar represents an in-loop reduction. */
@@ -17699,7 +17737,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& kind == vec_to_scalar
&& (m_vec_flags & VEC_ADVSIMD)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
auto dr = STMT_VINFO_DATA_REF (stmt_info);
tree dr_ref = DR_REF (dr);
@@ -17712,7 +17750,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
{
if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
{
- if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+ if (SLP_TREE_TYPE (node) == load_vec_info_type)
ops->loads += count - 1;
else
/* Stores want to count both the index to array and data to
@@ -17814,7 +17852,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& sve_issue
&& (kind == scalar_load || kind == scalar_store)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
unsigned int pairs = CEIL (count, 2);
ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
@@ -17969,8 +18007,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Check if we've seen an SVE gather/scatter operation and which size. */
if (kind == scalar_load
+ && node
+ && vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype))
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
{
const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
if (sve_costs)
@@ -19958,8 +19998,9 @@ aarch64_process_one_target_attr (char *arg_str)
if (valid)
{
set_option (&global_options, NULL, p_attr->opt_num, value,
- NULL, DK_UNSPECIFIED, input_location,
- global_dc);
+ NULL,
+ static_cast<int> (diagnostics::kind::unspecified),
+ input_location, global_dc);
}
else
{
@@ -20471,6 +20512,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2)
unsigned long _size; // Size of the struct, so it can grow.
unsigned long _hwcap;
unsigned long _hwcap2;
+ unsigned long _hwcap3;
+ unsigned long _hwcap4;
}
*/
@@ -20487,14 +20530,24 @@ build_ifunc_arg_type ()
tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
get_identifier ("_hwcap2"),
long_unsigned_type_node);
+ tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap3"),
+ long_unsigned_type_node);
+ tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap4"),
+ long_unsigned_type_node);
DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field4) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field5) = ifunc_arg_type;
TYPE_FIELDS (ifunc_arg_type) = field1;
DECL_CHAIN (field1) = field2;
DECL_CHAIN (field2) = field3;
+ DECL_CHAIN (field3) = field4;
+ DECL_CHAIN (field4) = field5;
layout_type (ifunc_arg_type);
@@ -24406,10 +24459,14 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type, int misalignment,
- bool is_packed)
+ bool is_packed,
+ bool is_gather_scatter)
{
if (TARGET_SIMD && STRICT_ALIGNMENT)
{
+ if (is_gather_scatter)
+ return true;
+
/* Return if movmisalign pattern is not supported for this mode. */
if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
return false;
@@ -24419,7 +24476,8 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
return false;
}
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* If VALS is a vector constant that can be loaded into a register
@@ -31948,9 +32006,43 @@ aarch64_test_sysreg_encoding_clashes (void)
static void
aarch64_test_sve_folding ()
{
+ aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
ssize_int (poly_int64 (1, 1)));
ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+ auto build_v16bi = [](bool a, bool b)
+ {
+ rtx_vector_builder builder (VNx16BImode, 2, 1);
+ builder.quick_push (a ? const1_rtx : const0_rtx);
+ builder.quick_push (b ? const1_rtx : const0_rtx);
+ return builder.build ();
+ };
+ rtx v16bi_10 = build_v16bi (1, 0);
+ rtx v16bi_01 = build_v16bi (0, 1);
+
+ for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+ {
+ rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+ rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+ rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+ rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+ rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+ rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+ rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+ lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+ VNx16BImode));
+ rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+ }
}
/* Run all target-specific selftests. */
diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.cc b/gcc/config/aarch64/cortex-a57-fma-steering.cc
index fd6da66..f7675be 100644
--- a/gcc/config/aarch64/cortex-a57-fma-steering.cc
+++ b/gcc/config/aarch64/cortex-a57-fma-steering.cc
@@ -948,6 +948,11 @@ func_fma_steering::analyze ()
/* Search the chain where this instruction is (one of) the root. */
dest_op_info = insn_rr[INSN_UID (insn)].op_info;
+
+ /* Register rename could fail. */
+ if (!dest_op_info)
+ continue;
+
dest_regno = REGNO (SET_DEST (PATTERN (insn)));
for (i = 0; i < dest_op_info->n_chains; i++)
{
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index c59fcd6..8533912 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -463,6 +463,7 @@
(define_mode_iterator VNx8SI_ONLY [VNx8SI])
(define_mode_iterator VNx8SF_ONLY [VNx8SF])
(define_mode_iterator VNx8DI_ONLY [VNx8DI])
+(define_mode_iterator VNx2SI_ONLY [VNx2SI])
(define_mode_iterator VNx4SI_ONLY [VNx4SI])
(define_mode_iterator VNx4SF_ONLY [VNx4SF])
(define_mode_iterator VNx2DI_ONLY [VNx2DI])
@@ -3366,6 +3367,10 @@
(define_int_iterator SVE_INT_UNARY [UNSPEC_REVB
UNSPEC_REVH UNSPEC_REVW])
+;; This iterator is currently only used for estimation instructions,
+;; which are never generated automatically when -ftrapping-math is true.
+;; The iterator is therefore applied unconditionally to partial FP modes.
+;; This might need to be revisited if new operations are added in future.
(define_int_iterator SVE_FP_UNARY [UNSPEC_FRECPE UNSPEC_RSQRTE])
(define_int_iterator SVE_FP_UNARY_INT [(UNSPEC_FEXPA "TARGET_NON_STREAMING")])
@@ -3378,6 +3383,10 @@
(define_int_iterator SVE_INT_BINARY_MULTI [UNSPEC_SQDMULH
UNSPEC_SRSHL UNSPEC_URSHL])
+;; This iterator is currently only used for estimation instructions,
+;; which are never generated automatically when -ftrapping-math is true.
+;; The iterator is therefore applied unconditionally to partial FP modes.
+;; This might need to be revisited if new operations are added in future.
(define_int_iterator SVE_FP_BINARY [UNSPEC_FRECPS UNSPEC_RSQRTS])
(define_int_iterator SVE_FP_BINARY_INT [UNSPEC_FTSMUL UNSPEC_FTSSEL])
@@ -3429,9 +3438,10 @@
UNSPEC_FMINQV
UNSPEC_FMINNMQV])
-(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FABS
- UNSPEC_COND_FNEG
- UNSPEC_COND_FRECPX
+(define_int_iterator SVE_COND_FP_UNARY_BITWISE [UNSPEC_COND_FABS
+ UNSPEC_COND_FNEG])
+
+(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FRECPX
UNSPEC_COND_FRINTA
UNSPEC_COND_FRINTI
UNSPEC_COND_FRINTM
@@ -3439,13 +3449,12 @@
UNSPEC_COND_FRINTP
UNSPEC_COND_FRINTX
UNSPEC_COND_FRINTZ
- UNSPEC_COND_FSQRT])
+ UNSPEC_COND_FSQRT
+ SVE_COND_FP_UNARY_BITWISE])
;; Same as SVE_COND_FP_UNARY, but without codes that have a dedicated
;; <optab><mode>2 expander.
-(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FABS
- UNSPEC_COND_FNEG
- UNSPEC_COND_FRECPX
+(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FRECPX
UNSPEC_COND_FRINTA
UNSPEC_COND_FRINTI
UNSPEC_COND_FRINTM
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index f76a250..9eb1a20 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -26,7 +26,7 @@
static const struct cpu_addrcost_table generic_armv9_a_addrcost_table =
{
{
- 1, /* hi */
+ 0, /* hi */
0, /* si */
0, /* di */
1, /* ti */
diff --git a/gcc/config/aarch64/tuning_models/olympus.h b/gcc/config/aarch64/tuning_models/olympus.h
new file mode 100644
index 0000000..268789d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/olympus.h
@@ -0,0 +1,210 @@
+/* Tuning model description for the NVIDIA Olympus core.
+ Copyright The GNU Toolchain Authors.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_OLYMPUS
+#define GCC_AARCH64_H_OLYMPUS
+
+#include "generic.h"
+
+static struct cpu_regmove_cost olympus_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 3, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static advsimd_vec_cost olympus_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static sve_vec_cost olympus_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 3, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 9, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
+ 1 /* scatter_store_elt_cost */
+};
+
+static aarch64_scalar_vec_issue_info olympus_scalar_issue_info =
+{
+ 4, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 8, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static aarch64_advsimd_vec_issue_info olympus_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static aarch64_sve_vec_issue_info olympus_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 1, /* while_pred_ops */
+ 0, /* int_cmp_pred_ops */
+ 0, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static aarch64_vec_issue_info olympus_vec_issue_info =
+{
+ &olympus_scalar_issue_info,
+ &olympus_advsimd_issue_info,
+ &olympus_sve_issue_info
+};
+
+/* Olympus costs for vector insn classes. */
+static struct cpu_vector_cost olympus_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &olympus_advsimd_vector_cost, /* advsimd */
+ &olympus_sve_vector_cost, /* sve */
+ &olympus_vec_issue_info /* issue_info */
+};
+
+/* Olympus prefetch settings (which disable prefetch). */
+static cpu_prefetch_tune olympus_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static struct tune_params olympus_tunings =
+{
+ &cortexa76_extra_costs,
+ &generic_armv9_a_addrcost_table,
+ &olympus_regmove_cost,
+ &olympus_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128, /* sve_width */
+ { 4, /* load_int. */
+ 1, /* store_int. */
+ 6, /* load_fp. */
+ 3, /* store_fp. */
+ 5, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 10, /* issue_rate */
+ AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 8, /* int_reassoc_width. */
+ 6, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 6, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_BASE
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+ | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
+ &olympus_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_OLYMPUS. */
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index c7a14b3..0600e59 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -123,9 +123,9 @@ const struct cpu_cost_table generic_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -230,9 +230,9 @@ const struct cpu_cost_table cortexa53_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -337,9 +337,9 @@ const struct cpu_cost_table cortexa57_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -444,9 +444,9 @@ const struct cpu_cost_table cortexa76_extra_costs =
{
COSTS_N_INSNS (1), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -551,9 +551,9 @@ const struct cpu_cost_table exynosm1_extra_costs =
{
COSTS_N_INSNS (0), /* alu. */
COSTS_N_INSNS (4), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
@@ -658,9 +658,9 @@ const struct cpu_cost_table xgene1_extra_costs =
{
COSTS_N_INSNS (2), /* alu. */
COSTS_N_INSNS (8), /* mult. */
- COSTS_N_INSNS (1), /* movi. */
- COSTS_N_INSNS (2), /* dup. */
- COSTS_N_INSNS (2) /* extract. */
+ COSTS_N_INSNS (0), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
}
};
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index bde06f3..29b45ae 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -289,7 +289,8 @@ static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
static bool arm_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
- bool is_packed);
+ bool is_packed,
+ bool is_gather_scatter);
static void arm_conditional_register_usage (void);
static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
@@ -30661,12 +30662,16 @@ arm_vector_alignment_reachable (const_tree type, bool is_packed)
static bool
arm_builtin_support_vector_misalignment (machine_mode mode,
const_tree type, int misalignment,
- bool is_packed)
+ bool is_packed,
+ bool is_gather_scatter)
{
if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
{
HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+ if (is_gather_scatter)
+ return true;
+
if (is_packed)
return align == 1;
@@ -30683,7 +30688,8 @@ arm_builtin_support_vector_misalignment (machine_mode mode,
}
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
static void
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index 284f49d..69df6d2 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -4120,9 +4120,8 @@ avr_optimize_casesi (rtx_insn *insns[5], rtx *xop)
JUMP_LABEL (cbranch) = xop[4];
++LABEL_NUSES (xop[4]);
- rtx_insn *seq1 = get_insns ();
rtx_insn *last1 = get_last_insn ();
- end_sequence ();
+ rtx_insn *seq1 = end_sequence ();
emit_insn_after (seq1, insns[2]);
@@ -4141,9 +4140,8 @@ avr_optimize_casesi (rtx_insn *insns[5], rtx *xop)
emit_insn (pat_4);
- rtx_insn *seq2 = get_insns ();
rtx_insn *last2 = get_last_insn ();
- end_sequence ();
+ rtx_insn *seq2 = end_sequence ();
emit_insn_after (seq2, insns[3]);
@@ -4845,6 +4843,137 @@ avr_pass_fuse_add::execute1 (function *func)
//////////////////////////////////////////////////////////////////////////////
+// Fuse 2 move insns after combine.
+
+static const pass_data avr_pass_data_2moves =
+{
+ RTL_PASS, // type
+ "", // name (will be patched)
+ OPTGROUP_NONE, // optinfo_flags
+ TV_DF_SCAN, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ 0 // todo_flags_finish
+};
+
+class avr_pass_2moves : public rtl_opt_pass
+{
+public:
+ avr_pass_2moves (gcc::context *ctxt, const char *name)
+ : rtl_opt_pass (avr_pass_data_2moves, ctxt)
+ {
+ this->name = name;
+ }
+
+ unsigned int execute (function *func) final override
+ {
+ if (optimize && avropt_fuse_move2)
+ {
+ bool changed = false;
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, func)
+ {
+ changed |= optimize_2moves_bb (bb);
+ }
+
+ if (changed)
+ {
+ df_note_add_problem ();
+ df_analyze ();
+ }
+ }
+
+ return 0;
+ }
+
+ bool optimize_2moves (rtx_insn *, rtx_insn *);
+ bool optimize_2moves_bb (basic_block);
+}; // avr_pass_2moves
+
+bool
+avr_pass_2moves::optimize_2moves_bb (basic_block bb)
+{
+ bool changed = false;
+ rtx_insn *insn1 = nullptr;
+ rtx_insn *insn2 = nullptr;
+ rtx_insn *curr;
+
+ FOR_BB_INSNS (bb, curr)
+ {
+ if (insn1 && INSN_P (insn1)
+ && insn2 && INSN_P (insn2))
+ changed |= optimize_2moves (insn1, insn2);
+
+ insn1 = insn2;
+ insn2 = curr;
+ }
+
+ return changed;
+}
+
+bool
+avr_pass_2moves::optimize_2moves (rtx_insn *insn1, rtx_insn *insn2)
+{
+ bool good = false;
+ bool bad = false;
+ rtx set1, dest1, src1;
+ rtx set2, dest2, src2;
+
+ if ((set1 = single_set (insn1))
+ && (set2 = single_set (insn2))
+ && (src1 = SET_SRC (set1))
+ && REG_P (src2 = SET_SRC (set2))
+ && REG_P (dest1 = SET_DEST (set1))
+ && REG_P (dest2 = SET_DEST (set2))
+ && rtx_equal_p (dest1, src2)
+ // Now we have:
+ // insn1: dest1 = src1
+ // insn2: dest2 = dest1
+ && REGNO (dest1) >= FIRST_PSEUDO_REGISTER
+ // Paranoia.
+ && GET_CODE (PATTERN (insn1)) != PARALLEL
+ && GET_CODE (PATTERN (insn2)) != PARALLEL
+ && (rtx_equal_p (dest2, src1)
+ || !reg_overlap_mentioned_p (dest2, src1)))
+ {
+ avr_dump ("\n;; Found 2moves:\n%r\n%r\n", insn1, insn2);
+ avr_dump (";; reg %d: insn uses uids:", REGNO (dest1));
+
+ // Go check that dest1 is used exactly once, namely by insn2.
+
+ df_ref use = DF_REG_USE_CHAIN (REGNO (dest1));
+ for (; use; use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *user = DF_REF_INSN (use);
+ avr_dump (" %d", INSN_UID (user));
+ good |= INSN_UID (user) == INSN_UID (insn2);
+ bad |= INSN_UID (user) != INSN_UID (insn2);
+ }
+ avr_dump (".\n");
+
+ if (good && !bad
+ // Propagate src1 to insn2:
+ // insn1: # Deleted
+ // insn2: dest2 = src1
+ && validate_change (insn2, &SET_SRC (set2), src1, false))
+ {
+ SET_INSN_DELETED (insn1);
+ return true;
+ }
+ }
+
+ if (good && !bad)
+ avr_dump (";; Failed\n");
+
+ return false;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////////
// Split insns with nonzero_bits() after combine.
static const pass_data avr_pass_data_split_nzb =
@@ -5706,6 +5835,14 @@ make_avr_pass_casesi (gcc::context *ctxt)
return new avr_pass_casesi (ctxt, "avr-casesi");
}
+// Optimize 2 consecutive moves after combine.
+
+rtl_opt_pass *
+make_avr_pass_2moves (gcc::context *ctxt)
+{
+ return new avr_pass_2moves (ctxt, "avr-2moves");
+}
+
rtl_opt_pass *
make_avr_pass_split_nzb (gcc::context *ctxt)
{
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def
index eb60a93..d668c7f 100644
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -74,6 +74,14 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes);
INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
+/* Insn combine may come up with superfluous reg-reg moves, where the combine
+ people say that these are no problem since reg-alloc is supposed to optimize
+ them. The issue is that the lower-subreg pass sitting between combine and
+ reg-alloc may split such moves, coming up with a zoo of subregs which are
+ only handled poorly by the register allocator. */
+
+INSERT_PASS_AFTER (pass_combine, 1, avr_pass_2moves);
+
/* Some combine insns have nonzero_bits() in their condition, though insns
should not use such stuff in their condition. Therefore, we split such
insn into something without nonzero_bits() in their condition right after
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index ca30136..37911e7 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -208,6 +208,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_nzb (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *);
+extern rtl_opt_pass *make_avr_pass_2moves (gcc::context *);
#ifdef RTX_CODE
extern bool avr_casei_sequence_check_operands (rtx *xop);
extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index c469297..1fb59b6 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -14418,6 +14418,13 @@ avr_output_addr_vec (rtx_insn *labl, rtx table)
// Output the label that precedes the table.
ASM_OUTPUT_ALIGN (stream, 1);
+
+ char s_labl[40];
+ targetm.asm_out.generate_internal_label (s_labl, "L",
+ CODE_LABEL_NUMBER (labl));
+ ASM_OUTPUT_TYPE_DIRECTIVE (stream, s_labl,
+ AVR_HAVE_JMP_CALL ? "object" : "function");
+
targetm.asm_out.internal_label (stream, "L", CODE_LABEL_NUMBER (labl));
// Output the table's content.
@@ -14984,10 +14991,11 @@ avr_addr_space_convert (rtx src, tree type_old, tree type_new)
/* Linearize memory: RAM has bit 23 set. When as_new = __flashx then
this is basically UB since __flashx mistreats RAM addresses, but there
- is no way to bail out. (Though -Waddr-space-convert will tell.) */
+ is no way to bail out. (Though -Waddr-space-convert will tell.)
+     ...but PR121277 is confusing, in particular when NULL comes in.  */
int msb = ADDR_SPACE_GENERIC_P (as_old)
- ? 0x80
+ ? as_new == ADDR_SPACE_MEMX ? 0x80 : 0x00
: avr_addrspace[as_old].segment;
src = force_reg (Pmode, src);
@@ -15085,10 +15093,16 @@ avr_convert_to_type (tree type, tree expr)
const char *name_old = avr_addrspace[as_old].name;
const char *name_new = avr_addrspace[as_new].name;
- warning (OPT_Waddr_space_convert,
- "conversion from address space %qs to address space %qs",
- ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
- ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
+ // Be relaxed when NULL is used, and when 0x0 stands for
+ // address 0x0.
+ bool nowarn = (expr == null_pointer_node
+ && (as_new == ADDR_SPACE_FLASHX
+ || as_new == ADDR_SPACE_FLASH));
+ if (!nowarn)
+ warning (OPT_Waddr_space_convert,
+ "conversion from address space %qs to address space %qs",
+ ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
+ ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
return fold_build1_loc (loc, ADDR_SPACE_CONVERT_EXPR, type, expr);
}
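A hypothetical example (not part of the patch) of the case that is now accepted silently: initializing a __flashx or __flash pointer from a null pointer constant no longer triggers -Waddr-space-convert.

  /* Previously: warning: conversion from address space "generic" to
     address space "__flashx".  Now silent, because the initializer is
     the null pointer constant.  */
  const __flashx char *p = 0;
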
diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt
index 9883119..7f6f18c 100644
--- a/gcc/config/avr/avr.opt
+++ b/gcc/config/avr/avr.opt
@@ -164,6 +164,10 @@ mfuse-move=
Target Joined RejectNegative UInteger Var(avropt_fuse_move) Init(0) Optimization IntegerRange(0, 23)
-mfuse-move=<0,23> Optimization. Run a post-reload pass that tweaks move instructions.
+mfuse-move2
+Target Var(avropt_fuse_move2) Init(0) Optimization
+Optimization. Fuse some move insns after insn combine.
+
mabsdata
Target Mask(ABSDATA)
Assume that all data in static storage can be accessed by LDS / STS instructions. This option is only useful for reduced Tiny devices like ATtiny40.
diff --git a/gcc/config/avr/avr.opt.urls b/gcc/config/avr/avr.opt.urls
index 662fdee..87c26b2 100644
--- a/gcc/config/avr/avr.opt.urls
+++ b/gcc/config/avr/avr.opt.urls
@@ -92,6 +92,9 @@ UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
mfuse-move=
UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
+mfuse-move2
+UrlSuffix(gcc/AVR-Options.html#index-mfuse-move2)
+
mabsdata
UrlSuffix(gcc/AVR-Options.html#index-mabsdata)
diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc
index a34c9e9..4acdd1d 100644
--- a/gcc/config/cris/cris.cc
+++ b/gcc/config/cris/cris.cc
@@ -3711,9 +3711,11 @@ cris_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
/* Determine if the source using MOF. If it is, automatically
clobbering MOF would cause it to have impossible constraints. */
- /* Look for a use of the MOF constraint letter: h. */
+ /* Look for a use of the MOF constraint letter h or a hard register
+ constraint. */
for (unsigned i = 0, n = constraints.length(); i < n; ++i)
- if (strchr (constraints[i], 'h') != NULL)
+ if (strchr (constraints[i], 'h') != NULL
+ || strstr (constraints[i], "{mof}") != NULL)
return NULL;
/* Look for an output or an input that touches MOF. */
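A hypothetical asm statement (not from the patch) showing what the new check is meant to catch: with the {regname} hard-register constraint syntax an operand can name MOF without using the 'h' constraint letter, and the implicit MOF clobber must be suppressed in that case too.

  unsigned int
  read_mof (void)
  {
    unsigned int v;
    /* Bind the output directly to the MOF register.  */
    __asm__ ("" : "={mof}" (v));
    return v;
  }
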
diff --git a/gcc/config/epiphany/epiphany.cc b/gcc/config/epiphany/epiphany.cc
index 16626f8..f53a643 100644
--- a/gcc/config/epiphany/epiphany.cc
+++ b/gcc/config/epiphany/epiphany.cc
@@ -2816,12 +2816,16 @@ epiphany_vector_alignment_reachable (const_tree type, bool is_packed)
static bool
epiphany_support_vector_misalignment (machine_mode mode, const_tree type,
- int misalignment, bool is_packed)
+ int misalignment, bool is_packed,
+ bool is_gather_scatter)
{
+ if (is_gather_scatter)
+ return true;
if (GET_MODE_SIZE (mode) == 8 && misalignment % 4 == 0)
return true;
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* STRUCTURE_SIZE_BOUNDARY seems a bit crude in how it enlarges small
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index 0bfc786..0287400 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -82,11 +82,18 @@ enum hsaco_attr_type
#define TARGET_DPP_FULL !TARGET_RDNA2_PLUS
#define TARGET_DPP16 TARGET_RDNA2_PLUS
#define TARGET_DPP8 TARGET_RDNA2_PLUS
+/* Device requires no manually inserted wait states; that's the
+   case for RDNA 2, 3 and 3.5 (but not for RDNA 4). */
+#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS
/* Device requires CDNA1-style manually inserted wait states for AVGPRs. */
#define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1
+/* Device requires CDNA3-style manually inserted wait states. */
+#define TARGET_CDNA3_NOPS TARGET_CDNA3
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
+   Note: on atomics, glc/sc0 denotes whether the pre-op value should be
+   returned.
CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
there is no non-scalar user so far. */
#define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 7c4dde1..a34d2e3 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -811,7 +811,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
; FIXME: 64bit operations really should be splitters, but I am not sure how
; to represent vertical subregs.
@@ -828,7 +828,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_expand "vec_set<mode>"
[(set (match_operand:V_MOV 0 "register_operand")
@@ -854,7 +854,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "*vec_set<mode>_1"
[(set (match_operand:V_2REG 0 "register_operand" "=v")
@@ -871,7 +871,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "vec_duplicate<mode><exec>"
[(set (match_operand:V_1REG 0 "register_operand" "=v")
@@ -910,7 +910,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -922,7 +922,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -934,7 +934,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "32")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop"
[(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v")
@@ -1133,6 +1133,23 @@
DONE;
})
+(define_expand "gather_load<mode><vndi>"
+ [(match_operand:V_MOV 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand:<VnDI> 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")]
+ ""
+ {
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+ operands[2], operands[4],
+ INTVAL (operands[3]), NULL);
+
+ emit_insn (gen_gather<mode>_insn_1offset (operands[0], addr, const0_rtx,
+ const0_rtx, const0_rtx));
+ DONE;
+ })
+
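A hypothetical source-level loop (not part of the patch) whose gathers naturally produce DImode offsets; with the new gather_load<mode><vndi> expander (and the matching scatter_store<mode><vndi> below) the vectorizer no longer has to narrow such offsets to SImode first.

  void
  gather64 (double *dst, const double *base, const long long *idx, int n)
  {
    for (int i = 0; i < n; ++i)
      dst[i] = base[idx[i]];   /* 64-bit indices -> VnDImode offset vector.  */
  }
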
; Allow any address expression
(define_expand "gather<mode>_expr<exec>"
[(set (match_operand:V_MOV 0 "register_operand")
@@ -1175,6 +1192,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
@@ -1233,6 +1251,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
@@ -1259,6 +1278,23 @@
DONE;
})
+(define_expand "scatter_store<mode><vndi>"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand:<VnDI> 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:V_MOV 4 "register_operand")]
+ ""
+ {
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+ operands[1], operands[3],
+ INTVAL (operands[2]), NULL);
+
+ emit_insn (gen_scatter<mode>_insn_1offset (addr, const0_rtx, operands[4],
+ const0_rtx, const0_rtx));
+ DONE;
+ })
+
; Allow any address expression
(define_expand "scatter<mode>_expr<exec_scatter>"
[(set (mem:BLK (scratch))
@@ -1301,6 +1337,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
@@ -1356,6 +1393,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
@@ -1645,6 +1683,39 @@
[(set_attr "type" "vmult")
(set_attr "length" "8")])
+(define_insn_and_split "add<mode>3_dup"
+ [(set (match_operand:V_DI 0 "register_operand" "= v")
+ (plus:V_DI
+ (vec_duplicate:V_DI
+ (match_operand:DI 1 "register_operand" "SvB"))
+ (match_operand:V_DI 2 "gcn_alu_operand" "vDb")))
+ (clobber (reg:DI VCC_REG))
+ (clobber (match_scratch:<VnSI> 3 "=&v"))]
+ ""
+ "#"
+ "gcn_can_split_p (<MODE>mode, operands[0])
+ && gcn_can_split_p (<MODE>mode, operands[1])
+ && gcn_can_split_p (<MODE>mode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_add<vnsi>3_vcc_dup
+ (gcn_operand_part (<MODE>mode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (<MODE>mode, operands[2], 0),
+ vcc));
+ emit_insn (gen_vec_duplicate<vnsi> (operands[3],
+ gcn_operand_part (DImode, operands[1], 1)));
+ emit_insn (gen_addc<vnsi>3
+ (gcn_operand_part (<MODE>mode, operands[0], 1),
+ operands[3],
+ gcn_operand_part (<MODE>mode, operands[2], 1),
+ vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
(define_insn_and_split "add<mode>3_exec"
[(set (match_operand:V_DI 0 "register_operand" "= v")
(vec_merge:V_DI
@@ -1682,6 +1753,49 @@
[(set_attr "type" "vmult")
(set_attr "length" "8")])
+(define_insn_and_split "add<mode>3_dup_exec"
+ [(set (match_operand:V_DI 0 "register_operand" "= v")
+ (vec_merge:V_DI
+ (plus:V_DI
+ (vec_duplicate:V_DI
+ (match_operand:DI 1 "register_operand" "SvB"))
+ (match_operand:V_DI 2 "gcn_alu_operand" "vDb"))
+ (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))
+ (clobber (match_scratch:<VnSI> 5 "=&v"))]
+ ""
+ "#"
+ "gcn_can_split_p (<MODE>mode, operands[0])
+ && gcn_can_split_p (<MODE>mode, operands[1])
+ && gcn_can_split_p (<MODE>mode, operands[2])
+ && gcn_can_split_p (<MODE>mode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_add<vnsi>3_vcc_dup_exec
+ (gcn_operand_part (<MODE>mode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (<MODE>mode, operands[2], 0),
+ vcc,
+ gcn_operand_part (<MODE>mode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_vec_duplicate<vnsi>_exec (operands[5],
+ gcn_operand_part (DImode, operands[1], 1),
+ gcn_gen_undef (<VnSI>mode),
+ operands[4]));
+ emit_insn (gen_addc<vnsi>3_exec
+ (gcn_operand_part (<MODE>mode, operands[0], 1),
+ operands[5],
+ gcn_operand_part (<MODE>mode, operands[2], 1),
+ vcc, vcc,
+ gcn_operand_part (<MODE>mode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
(define_insn_and_split "sub<mode>3"
[(set (match_operand:V_DI 0 "register_operand" "= v, v")
(minus:V_DI
@@ -2187,6 +2301,22 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
+(define_insn "<su>mul<mode>3_highpart_dup<exec>"
+ [(set (match_operand:V_SI 0 "register_operand" "= v")
+ (truncate:V_SI
+ (lshiftrt:<VnDI>
+ (mult:<VnDI>
+ (any_extend:<VnDI>
+ (vec_duplicate:V_SI
+ (match_operand:SI 1 "gcn_alu_operand" "SvA")))
+ (any_extend:<VnDI>
+ (match_operand:V_SI 2 "gcn_alu_operand" " vA")))
+ (const_int 32))))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
(define_insn "mul<mode>3<exec>"
[(set (match_operand:V_INT_1REG 0 "register_operand" "= v")
(mult:V_INT_1REG
@@ -2198,11 +2328,11 @@
(set_attr "length" "8")])
(define_insn "mul<mode>3_dup<exec>"
- [(set (match_operand:V_INT_1REG 0 "register_operand" "= v")
+ [(set (match_operand:V_INT_1REG 0 "register_operand" "= v")
(mult:V_INT_1REG
- (match_operand:V_INT_1REG 1 "gcn_alu_operand" "%vSvA")
(vec_duplicate:V_INT_1REG
- (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand" " SvA"))))]
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvA"))
+ (match_operand:V_INT_1REG 2 "gcn_alu_operand" " vA")))]
""
"v_mul_lo_u32\t%0, %1, %2"
[(set_attr "type" "vop3a")
@@ -2238,6 +2368,37 @@
DONE;
})
+(define_insn_and_split "mul<mode>3_dup"
+ [(set (match_operand:V_DI 0 "register_operand" "=&v")
+ (mult:V_DI
+ (vec_duplicate:V_DI
+ (match_operand:DI 1 "gcn_alu_operand" " Sv"))
+ (match_operand:V_DI 2 "gcn_alu_operand" "vDA")))
+ (clobber (match_scratch:<VnSI> 3 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1);
+ rtx left_lo = gcn_operand_part (DImode, operands[1], 0);
+ rtx left_hi = gcn_operand_part (DImode, operands[1], 1);
+ rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1);
+ rtx tmp = operands[3];
+
+ emit_insn (gen_mul<vnsi>3_dup (out_lo, left_lo, right_lo));
+ emit_insn (gen_umul<vnsi>3_highpart_dup (out_hi, left_lo, right_lo));
+ emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_lo));
+ emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+ emit_insn (gen_mul<vnsi>3_dup (tmp, left_lo, right_hi));
+ emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+ emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_hi));
+ emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+ DONE;
+ })
+
(define_insn_and_split "mul<mode>3_exec"
[(set (match_operand:V_DI 0 "register_operand" "=&v")
(vec_merge:V_DI
@@ -2286,6 +2447,56 @@
DONE;
})
+(define_insn_and_split "mul<mode>3_dup_exec"
+ [(set (match_operand:V_DI 0 "register_operand" "=&v")
+ (vec_merge:V_DI
+ (mult:V_DI
+ (vec_duplicate:V_DI
+ (match_operand:DI 1 "gcn_alu_operand" " Sv"))
+ (match_operand:V_DI 2 "gcn_alu_operand" "vDA"))
+ (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:<VnSI> 5 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1);
+ rtx left_lo = gcn_operand_part (DImode, operands[1], 0);
+ rtx left_hi = gcn_operand_part (DImode, operands[1], 1);
+ rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1);
+ rtx exec = operands[4];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[3]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (<VnSI>mode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (<MODE>mode, operands[3], 0);
+ old_hi = gcn_operand_part (<MODE>mode, operands[3], 1);
+ }
+
+ rtx undef = gcn_gen_undef (<VnSI>mode);
+
+ emit_insn (gen_mul<vnsi>3_dup_exec (out_lo, left_lo, right_lo, old_lo,
+ exec));
+ emit_insn (gen_umul<vnsi>3_highpart_dup_exec (out_hi, left_lo, right_lo,
+ old_hi, exec));
+ emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_lo, undef, exec));
+ emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_lo, right_hi, undef, exec));
+ emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_hi, undef, exec));
+ emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ DONE;
+ })
+
(define_insn_and_split "mul<mode>3_zext"
[(set (match_operand:V_DI 0 "register_operand" "=&v")
(mult:V_DI
@@ -3053,7 +3264,8 @@
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
(define_insn "sqrt<mode>2"
[(set (match_operand:FP 0 "register_operand" "= v")
@@ -3062,7 +3274,8 @@
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
; These FP unops have f64, f32 and f16 versions.
(define_int_iterator MATH_UNOP_1OR2REG
@@ -3352,7 +3565,8 @@
""
"v_rcp%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
;; one that matches op3 adjusted for best results in reciprocal division.
@@ -3724,6 +3938,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3778,6 +3993,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3836,6 +4052,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -3859,6 +4076,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -4049,6 +4267,32 @@
DONE;
})
+(define_expand "mask_gather_load<mode><vndi>"
+ [(set:V_MOV (match_operand:V_MOV 0 "register_operand")
+ (unspec:V_MOV
+ [(match_operand:DI 1 "register_operand")
+ (match_operand:<VnDI> 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")
+ (match_operand:DI 5 "")
+ (match_operand:V_MOV 6 "maskload_else_operand")]
+ UNSPEC_GATHER))]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[5]);
+
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+ operands[2], operands[4],
+ INTVAL (operands[3]), exec);
+
+ emit_insn (gen_gather<mode>_insn_1offset_exec (operands[0], addr,
+ const0_rtx, const0_rtx,
+ const0_rtx,
+ gcn_gen_undef (<MODE>mode),
+ exec));
+ DONE;
+ })
+
(define_expand "mask_scatter_store<mode><vnsi>"
[(match_operand:DI 0 "register_operand")
(match_operand:<VnSI> 1 "register_operand")
@@ -4077,6 +4321,27 @@
DONE;
})
+(define_expand "mask_scatter_store<mode><vndi>"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand:<VnDI> 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:V_MOV 4 "register_operand")
+ (match_operand:DI 5 "")]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[5]);
+
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+ operands[1], operands[3],
+ INTVAL (operands[2]), exec);
+
+ emit_insn (gen_scatter<mode>_insn_1offset_exec (addr, const0_rtx,
+ operands[4], const0_rtx,
+ const0_rtx, exec));
+ DONE;
+ })
+
(define_code_iterator cond_op [plus minus mult])
(define_expand "cond_<expander><mode>"
@@ -4397,7 +4662,7 @@
rtx tmp = gen_reg_rtx (<MODE>mode);
rtx v1 = gen_rtx_REG (<MODE>mode, VGPR_REGNO (1));
- emit_insn (gen_mul<mode>3_dup (tmp, v1, operands[2]));
+ emit_insn (gen_mul<mode>3_dup (tmp, operands[2], v1));
emit_insn (gen_add<mode>3_dup (operands[0], tmp, operands[1]));
DONE;
})
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 0ce5a29..5ffeb23 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
#include "gimple.h"
#include "cgraph.h"
#include "case-cfn-macros.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+ /* TODO: This seems to produce tighter loops, but the testsuites expects it
+  /* TODO: This seems to produce tighter loops, but the testsuite expects it
+ SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+ param_vect_partial_vector_usage, 1); */
}
/* }}} */
@@ -1275,13 +1281,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS) \
}
#define GEN_VNM_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
GEN_VN_NOEXEC (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
GEN_VN_NOEXEC (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
static rtx \
gen_##PREFIX##vNm##SUFFIX (PARAMS) \
{ \
@@ -1289,13 +1295,13 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS) \
\
switch (mode) \
{ \
- case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS); \
- case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS); \
- case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS); \
+ USE_QHF (case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS);) \
+ USE_QHF (case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS);) \
+ USE_QHF (case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS);) \
case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS); \
- case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS); \
+ USE_QHF (case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS);) \
case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS); \
- case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS); \
+ USE_QHF (case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS);) \
default: \
break; \
} \
@@ -1340,13 +1346,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
}
#define GEN_VNM(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
GEN_VN (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
GEN_VN (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
USE_TI (GEN_VN (PREFIX, ti##SUFFIX, A(PARAMS), A(ARGS))) \
static rtx \
gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
@@ -1355,15 +1361,22 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
\
switch (mode) \
{ \
- case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec); \
- case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec); \
- case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec); \
- case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
- case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec); \
- case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
- case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec); \
- case E_TImode: \
- USE_TI (return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \
+ USE_QHF (case E_QImode: \
+ return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec);) \
+ USE_QHF (case E_HImode: \
+ return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec);) \
+ USE_QHF (case E_HFmode: \
+ return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec);) \
+ case E_SImode: \
+ return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
+ USE_QHF (case E_SFmode: \
+ return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec);) \
+ case E_DImode: \
+ return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
+ USE_QHF (case E_DFmode: \
+ return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec);) \
+ USE_TI (case E_TImode: \
+ return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \
default: \
break; \
} \
@@ -1372,7 +1385,8 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
return NULL_RTX; \
}
-/* These have TImode support. */
+/* These support everything. */
+#define USE_QHF(ARGS) ARGS
#define USE_TI(ARGS) ARGS
GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src))
GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
@@ -1382,6 +1396,7 @@ GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
#define USE_TI(ARGS)
GEN_VNM (add,3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,di3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,si3_vcc_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
A(dest, src1, src2, vcc))
GEN_VN (add,di3_sext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
@@ -1393,15 +1408,20 @@ GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc),
GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin),
A(dest, src1, src2, vccout, vccin))
GEN_VN (and,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
-GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec),
A(dest, addr, src, exec))
GEN_VNM (gather,_expr, A(rtx dest, rtx addr, rtx as, rtx vol),
A(dest, addr, as, vol))
-GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (sub,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN_NOEXEC (vec_series,si, A(rtx dest, rtx x, rtx c), A(dest, x, c))
+/* These do not have QI, HI, or any FP support. */
+#undef USE_QHF
+#define USE_QHF(ARGS)
+GEN_VNM (ashl,3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
+GEN_VNM (mul,3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+
+#undef USE_QHF
#undef USE_TI
#undef GEN_VNM
#undef GEN_VN
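
To make the USE_QHF/USE_TI gating above concrete, here is a rough hand-expansion of the mode dispatcher that GEN_VNM (ashl,3, ...) produces once USE_QHF drops its argument. This is a sketch only; the exact mode computation and fallback are assumptions, not the literal macro output.

/* Sketch: with USE_QHF(ARGS) empty, only the SImode and DImode element
   cases survive in the generated dispatcher.  */
static rtx
gen_ashlvNm3 (rtx dest, rtx src, rtx shift, rtx merge_src = NULL, rtx exec = NULL)
{
  machine_mode mode = GET_MODE_INNER (GET_MODE (dest));  /* assumed */

  switch (mode)
    {
    case E_SImode: return gen_ashlvNsi3 (dest, src, shift, merge_src, exec);
    case E_DImode: return gen_ashlvNdi3 (dest, src, shift, merge_src, exec);
    default: break;
    }
  return NULL_RTX;  /* assumed fallback */
}
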
@@ -1995,8 +2015,8 @@ gcn_expand_vector_init (rtx op0, rtx vec)
rtx addr = gen_reg_rtx (addrmode);
int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0)));
- emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)),
- GEN_INT (unit_size)));
+ emit_insn (gen_mulvNsi3_dup (ramp, GEN_INT (unit_size),
+ gen_rtx_REG (offsetmode, VGPR_REGNO (1))));
bool simple_repeat = true;
@@ -2293,36 +2313,46 @@ gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
Return values.
ADDR_SPACE_FLAT - return VnDImode vector of absolute addresses.
- ADDR_SPACE_GLOBAL - return VnSImode vector of offsets. */
+ ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.
+ 64-bit offsets - return VnDImode vector of absolute addresses. */
rtx
gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
bool unsigned_p, rtx exec)
{
int vf = GET_MODE_NUNITS (GET_MODE (offsets));
- rtx tmpsi = gen_reg_rtx (VnMODE (vf, SImode));
- rtx tmpdi = gen_reg_rtx (VnMODE (vf, DImode));
+ rtx scaled_offsets = gen_reg_rtx (GET_MODE (offsets));
+ rtx abs_addr = gen_reg_rtx (VnMODE (vf, DImode));
+ bool use_di = GET_MODE_INNER (GET_MODE (scaled_offsets)) == DImode;
if (CONST_INT_P (scale)
&& INTVAL (scale) > 0
&& exact_log2 (INTVAL (scale)) >= 0)
- emit_insn (gen_ashlvNsi3 (tmpsi, offsets,
- GEN_INT (exact_log2 (INTVAL (scale))),
- NULL, exec));
+ emit_insn (gen_ashlvNm3 (scaled_offsets, offsets,
+ GEN_INT (exact_log2 (INTVAL (scale))),
+ NULL, exec));
else
- emit_insn (gen_mulvNsi3_dup (tmpsi, offsets, scale, NULL, exec));
+ emit_insn (gen_mulvNm3_dup (scaled_offsets, scale, offsets, NULL, exec));
+ /* No instructions support DImode offsets. */
+ if (use_di)
+ {
+ emit_insn (gen_addvNdi3_dup (abs_addr, base, scaled_offsets, NULL, exec));
+ return abs_addr;
+ }
/* "Global" instructions do not support negative register offsets. */
- if (as == ADDR_SPACE_FLAT || !unsigned_p)
+ else if (as == ADDR_SPACE_FLAT || !unsigned_p)
{
if (unsigned_p)
- emit_insn (gen_addvNdi3_zext_dup2 (tmpdi, tmpsi, base, NULL, exec));
+ emit_insn (gen_addvNdi3_zext_dup2 (abs_addr, scaled_offsets, base,
+ NULL, exec));
else
- emit_insn (gen_addvNdi3_sext_dup2 (tmpdi, tmpsi, base, NULL, exec));
- return tmpdi;
+ emit_insn (gen_addvNdi3_sext_dup2 (abs_addr, scaled_offsets, base,
+ NULL, exec));
+ return abs_addr;
}
else if (as == ADDR_SPACE_GLOBAL)
- return tmpsi;
+ return scaled_offsets;
gcc_unreachable ();
}
@@ -5315,8 +5345,12 @@ gcn_preferred_vector_alignment (const_tree type)
static bool
gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
const_tree type, int misalignment,
- bool is_packed)
+ bool is_packed,
+ bool is_gather_scatter)
{
+ if (is_gather_scatter)
+ return true;
+
if (is_packed)
return false;
@@ -5761,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class,
return bsd_libc_has_function (fn_class, type);
}
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
+
+static bool
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+ int ARG_UNUSED (scale),
+ unsigned int ARG_UNUSED (group_size))
+{
+ return true;
+}
+
/* }}} */
/* {{{ md_reorg pass. */
@@ -6124,12 +6168,22 @@ gcn_md_reorg (void)
detects the missed cases, and inserts the documented number of NOPs
required for correct execution. */
+ /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
+ s_nop, see 5.7 and esp. 5.7.2 in its ISA manual.
+ The assert here is a reminder to add those. */
+ STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1);
+
+ if (TARGET_NO_MANUAL_NOPS)
+ return;
+
const int max_waits = 5;
struct ilist
{
rtx_insn *insn;
attr_unit unit;
- attr_delayeduse delayeduse;
+ attr_type type;
+ attr_flatmemaccess flatmemaccess;
+ bool delayeduse;
HARD_REG_SET writes;
HARD_REG_SET reads;
int age;
@@ -6150,7 +6204,29 @@ gcn_md_reorg (void)
attr_type itype = get_attr_type (insn);
attr_unit iunit = get_attr_unit (insn);
- attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+ attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
+ bool delayeduse;
+ if (TARGET_CDNA3_NOPS)
+ switch (iflatmemaccess)
+ {
+ case FLATMEMACCESS_STORE:
+ case FLATMEMACCESS_STOREX34:
+ case FLATMEMACCESS_ATOMIC:
+ case FLATMEMACCESS_CMPSWAPX2:
+ delayeduse = true;
+ break;
+ case FLATMEMACCESS_LOAD:
+ case FLATMEMACCESS_ATOMICWAIT:
+ case FLATMEMACCESS_NO:
+ delayeduse = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ else
+ delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
+ || iflatmemaccess == FLATMEMACCESS_STOREX34);
+
int ivccwait = get_attr_vccwait (insn);
HARD_REG_SET ireads, iwrites;
CLEAR_HARD_REG_SET (ireads);
@@ -6195,16 +6271,26 @@ gcn_md_reorg (void)
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
nops_rqd = 5 - prev_insn->age;
- /* VALU writes SGPR/VCC followed by v_{read,write}lane using
- SGPR/VCC as lane select requires 4 wait states. */
+ /* VALU writes SGPR/VCC followed by
+ - v_{read,write}lane using SGPR/VCC as lane select requires
+ 4 wait states
+ - [CDNA3] VALU reads SGPR as constant requires 1 wait state
+ - [CDNA3] VALU reads SGPR as carry-in requires no wait states */
if ((prev_insn->age + nops_rqd) < 4
&& prev_insn->unit == UNIT_VECTOR
- && get_attr_laneselect (insn) == LANESELECT_YES
+ && get_attr_laneselect (insn) != LANESELECT_NO
&& (hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) SGPR_REGS])
|| hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
requires 2 wait states. */
@@ -6217,22 +6303,128 @@ gcn_md_reorg (void)
nops_rqd = 2 - prev_insn->age;
}
+ /* VALU writes EXEC followed by VALU DPP op requires 5 nops. */
+ if ((prev_insn->age + nops_rqd) < 5
+ && itype == TYPE_VOP_DPP
+ && prev_insn->unit == UNIT_VECTOR
+ && TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG))
+ nops_rqd = 5 - prev_insn->age;
+
/* Store that requires input registers are not overwritten by
- following instruction. */
- if ((prev_insn->age + nops_rqd) < 1
- && prev_insn->delayeduse == DELAYEDUSE_YES
+ following instruction.
+ For CDNA3 only, VALU writes require 2 nops instead of 1.
+ CDNA3 additionally requires 1 or 2 nops for global & scratch
+ store/atomic. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && prev_insn->delayeduse
+ && iunit == UNIT_VECTOR
+ && ((hard_reg_set_intersect_p
+ (prev_insn->reads, iwrites))))
+ nops_rqd = 2 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1
+ && prev_insn->delayeduse
&& ((hard_reg_set_intersect_p
(prev_insn->reads, iwrites))))
nops_rqd = 1 - prev_insn->age;
- /* Instruction that requires VCC is not written too close before
- using it. */
+ /* Instruction (such as v_div_fmas) that requires VCC is not written
+ too close before using it. */
if (prev_insn->age < ivccwait
&& (hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* NOTE: The following wait-state condition exists, but GCC does not
+ access the special registers using their SGPR#. Thus, no action is
+ required here. The condition applies at least from VEGA/gfx900+ to
+ CDNA3:
+ Mixed use of VCC: alias vs. SGPR# - v_readlane, v_readfirstlane,
+ v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale followed by a VALU op
+ reading VCC as a constant requires 1 wait state.
+ (As carry-in, it requires none.)
+ [VCC can be accessed by name or by the logical SGPR that holds it.] */
+
+ /* Testing indicates that CDNA3 requires an s_nop between
+ e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+ Thus: add it between v_cmp writing VCC and VALU read of VCC. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && iunit == UNIT_VECTOR
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 wait state
+ - VALU reads SGPR as carry-in requires no wait state
+ - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && iunit == UNIT_VECTOR
+ && prev_insn->unit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_SRC_REGS]))
+ {
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
+ }
+
+ /* CDNA3: v_cmpx followed by
+ - V_readlane, v_readfirstlane, v_writelane requires 4 wait states
+ - VALU reads EXEC as constant requires 2 wait states
+ - other VALU requires no wait state */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
+ && get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && iunit == UNIT_VECTOR
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
+ && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
+ nops_rqd = 2 - prev_insn->age;
+
+ /* CDNA3: VALU writes VGPR followed by v_readlane whose vsrc0 reads that
+ VGPR requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD
+ && get_attr_laneselect (insn) == LANESELECT_READ
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) VGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU op which uses OPSEL or SDWA and changes the result's
+ bit position, followed by a VALU op that consumes that result,
+ requires 1 wait state.
+ FIXME: Handle OPSEL, once used. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->type == TYPE_VOP_SDWA
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU trans op (such as v_rcp_f64) followed by a non-trans
+ VALU op that consumes its result requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && get_attr_transop (prev_insn->insn) == TRANSOP_YES
+ && get_attr_transop (insn) == TRANSOP_NO
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
/* CDNA1: write VGPR before v_accvgpr_write reads it. */
if (TARGET_AVGPR_CDNA1_NOPS
&& (prev_insn->age + nops_rqd) < 2
@@ -6264,8 +6456,8 @@ gcn_md_reorg (void)
}
/* Insert the required number of NOPs. */
- for (int i = nops_rqd; i > 0; i--)
- emit_insn_after (gen_nop (), last_insn);
+ if (nops_rqd > 0)
+ emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn);
/* Age the previous instructions. We can also ignore writes to
registers subsequently overwritten. */
@@ -6288,7 +6480,9 @@ gcn_md_reorg (void)
/* Track the current instruction as a previous instruction. */
back[oldest].insn = insn;
back[oldest].unit = iunit;
- back[oldest].delayeduse = idelayeduse;
+ back[oldest].type = itype;
+ back[oldest].flatmemaccess = iflatmemaccess;
+ back[oldest].delayeduse = delayeduse;
back[oldest].writes = iwrites;
back[oldest].reads = ireads;
back[oldest].age = 0;
@@ -7109,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem)
H - print second part of a multi-reg value (high-part of 2-reg value)
J - print third part of a multi-reg value
K - print fourth part of a multi-reg value
+ R - print a scalar register number as an integer. Temporary hack.
+ V - print a vector register number as an integer. Temporary hack.
+
+ Additionally, the standard built-in modifiers c, n, a, and l exist; see gccint's
+ "Output Templates and Operand Substitution" for details.
*/
void
@@ -7957,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
gcn_vectorize_builtin_vectorized_function
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 9193461..4130cf6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -312,18 +312,33 @@
; We need to be able to identify v_readlane and v_writelane with
; SGPR lane selection in order to handle "Manually Inserted Wait States".
-(define_attr "laneselect" "yes,no" (const_string "no"))
+(define_attr "laneselect" "write,read,no" (const_string "no"))
-; Identify instructions that require a "Manually Inserted Wait State" if
-; their inputs are overwritten by subsequent instructions.
+; Classify global or flat memory accesses: stores and loads followed by a
+; waitcnt, and flat/global atomic accesses, possibly followed by a waitcnt.
+; 'storex34' denotes FLAT_STORE_X{3,4}.
+; 'cmpswapx2' denotes FLAT_ATOMIC_{F}CMPSWAP_X2.
+; Used to handle "Manually Inserted Wait State".
-(define_attr "delayeduse" "yes,no" (const_string "no"))
+(define_attr "flatmemaccess"
+ "store,storex34,load,atomic,atomicwait,cmpswapx2,no"
+ (const_string "no"))
+
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
(define_attr "vccwait" "" (const_int 0))
+; Mark trans ops such as v_{exp,rsq,sqrt,sin,cos,log,...}_F{16,32,64}
+; for later conditional s_nop insertion.
+
+(define_attr "transop" "yes,no" (const_string "no"))
+
;; }}}
;; {{{ Iterators useful across the wole machine description
@@ -414,6 +429,15 @@
"s_nop\t0x0"
[(set_attr "type" "sopp")])
+; Variant of 'nop' that accepts a count argument.
+; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however,
+; as %0 prints a decimal number, only 0 to 9 (= 1 to 10 nops) can be used.
+(define_insn "nops"
+ [(match_operand 0 "const_int_operand")]
+ ""
+ "s_nop\t0x%0"
+ [(set_attr "type" "sopp")])
+
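
Together with the gcn_md_reorg hunk above, the encoding works out as follows; the calls are illustrative only, not generated code.

/* Sketch: gcn_md_reorg now emits a single "nops" insn instead of a loop of
   "nop" insns.  The operand is nops_rqd - 1 because "s_nop 0xN" already
   executes N+1 wait states.  */
emit_insn_after (gen_nops (GEN_INT (1 - 1)), last_insn);  /* "s_nop 0x0": 1 nop   */
emit_insn_after (gen_nops (GEN_INT (5 - 1)), last_insn);  /* "s_nop 0x4": 5 nops  */
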
; FIXME: What should the value of the immediate be? Zero is disallowed, so
; pick 1 for now.
(define_insn "trap"
@@ -555,9 +579,12 @@
}
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
+ (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+ (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
- (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")])
+ (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
+ (set_attr "laneselect" "*,*,read,*,*,*,*,*,*,*,*,*,*,*,*")])
; 32bit move pattern
@@ -565,38 +592,38 @@
[(set (match_operand:SISF 0 "nonimmediate_operand")
(match_operand:SISF 1 "gcn_load_operand"))]
""
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,RB ;smem ,* ,12,* ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
- [&SD ,RB ;smem ,* ,12,* ,on ] ^
- [RB ,Sm ;smem ,* ,12,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
- [Sm ,RS ;smem ,* ,12,* ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,* ,12,* ,on ] ^
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dword\t%1, %A0
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,8 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,8 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,4,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dword\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [SD ,Y ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,RM ;flat ,* ,12,* ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store_dword\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,RB ;smem ,* ,12,* ,off,* ,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ [&SD ,RB ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RB ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
+ [Sm ,RS ;smem ,* ,12,* ,off,* ,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RS ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_store_dword\t%1, %A0
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,8 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,8 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,4,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store_dword\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [SD ,Y ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store_dword\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 8/16bit move pattern
@@ -606,31 +633,31 @@
[(set (match_operand:QIHI 0 "nonimmediate_operand")
(match_operand:QIHI 1 "gcn_load_operand"))]
"gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,4 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,4 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,8,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store%s0\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,* ,12,* ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store%s0\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,4 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,4 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,8,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store%s0\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store%s0\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 64bit move pattern
@@ -639,34 +666,34 @@
[(set (match_operand:DIDF 0 "nonimmediate_operand")
(match_operand:DIDF 1 "general_operand"))]
"GET_CODE(operands[1]) != SYMBOL_REF"
- {@ [cons: =0, 1; attrs: type, length, cdna, xnack]
- [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1
- [SD ,C ;sop1 ,8 ,* ,* ] ^
- [SD ,DB ;mult ,* ,* ,* ] #
- [RS ,Sm ;smem ,12,* ,* ] s_store_dwordx2\t%1, %A0
- [Sm ,RS ;smem ,12,* ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,12,* ,on ] ^
- [v ,v ;vmult,* ,* ,* ] #
- [v ,DB ;vmult,* ,* ,* ] #
- [Sg ,v ;vmult,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ] #
- [v ,^a ;vmult,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ] #
- [a ,a ;vmult,* ,cdna2,* ] #
- [v ,RF ;flat ,12,* ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,12,* ,on ] ^
- [^a ,RF ;flat ,12,cdna2,off] ^
- [&^a ,RF ;flat ,12,cdna2,on ] ^
- [RF ,v ;flat ,12,* ,* ] flat_store_dwordx2\t%A0, %1%O0%g0
- [RF ,a ;flat ,12,cdna2,* ] ^
- [RLRG,v ;ds ,12,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,12,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,12,* ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,12,* ,on ] ^
- [^a ,RM ;flat ,12,cdna2,off] ^
- [&^a ,RM ;flat ,12,cdna2,on ] ^
- [RM ,v ;flat ,12,* ,* ] global_store_dwordx2\t%A0, %1%O0%g0
- [RM ,a ;flat ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSA ;sop1 ,4 ,* ,* ,* ] s_mov_b64\t%0, %1
+ [SD ,C ;sop1 ,8 ,* ,* ,* ] ^
+ [SD ,DB ;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx2\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,12,* ,on ,* ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,DB ;vmult,* ,* ,* ,* ] #
+ [Sg ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RF ;flat ,12,cdna2,on ,load ] ^
+ [RF ,v ;flat ,12,* ,* ,store] flat_store_dwordx2\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,store] ^
+ [RLRG,v ;ds ,12,* ,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,12,* ,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RM ;flat ,12,cdna2,on ,load ] ^
+ [RM ,v ;flat ,12,* ,* ,store] global_store_dwordx2\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,store] ^
}
"reload_completed
&& ((!MEM_P (operands[0]) && !MEM_P (operands[1])
@@ -704,31 +731,31 @@
[(set (match_operand:TI 0 "nonimmediate_operand")
(match_operand:TI 1 "general_operand" ))]
""
- {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack]
- [SD ,SSB;mult ,* ,* ,* ,* ] #
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0
- [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm,RS ;smem ,yes,12,* ,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dwordx4\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a,RF ;flat ,* ,12,cdna2,on ] ^
- [v ,v ;vmult,* ,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ,* ] #
- [SD ,v ;vmult,* ,* ,* ,* ] #
- [RM ,v ;flat ,yes,12,* ,* ] global_store_dwordx4\t%A0, %1%O0%g0
- [RM ,a ;flat ,yes,12,cdna2,* ] ^
- [v ,RM ;flat ,* ,12,* ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a,RM ;flat ,* ,12,cdna2,on ] ^
- [RL ,v ;ds ,* ,12,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RL ;ds ,* ,12,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,^a ;vmult,* ,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ,* ] #
- [a ,a ;vmult,* ,* ,cdna2,* ] #
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSB;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx4\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm,RS ;smem ,12,* ,on ,* ] ^
+ [RF ,v ;flat ,12,* ,* ,storex34] flat_store_dwordx4\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a,RF ;flat ,12,cdna2,on ,load ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [SD ,v ;vmult,* ,* ,* ,* ] #
+ [RM ,v ;flat ,12,* ,* ,storex34] global_store_dwordx4\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a,RM ;flat ,12,cdna2,on ,load ] ^
+ [RL ,v ;ds ,12,* ,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RL ;ds ,12,* ,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
}
"reload_completed
&& REG_P (operands[0])
@@ -1077,6 +1104,7 @@
s_cmp%D1\t%2, %3
v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "sopc,vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_insn "cstoredi4_vector"
@@ -1087,6 +1115,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranchdi4"
@@ -1113,6 +1142,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranch<mode>4"
@@ -1985,6 +2015,7 @@
flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
; FIXME: These patterns are disabled because the instructions don't
@@ -2006,6 +2037,7 @@
flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
(define_mode_attr x2 [(SI "DI") (DI "TI")])
@@ -2053,7 +2085,7 @@
global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
(set_attr "length" "12")
- (set_attr "delayeduse" "*,yes,yes")])
+ (set_attr "flatmemaccess" "*,cmpswapx2,cmpswapx2")])
(define_insn "sync_compare_and_swap<mode>_lds_insn"
[(set (match_operand:SIDI 0 "register_operand" "= v")
@@ -2151,7 +2183,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2163,7 +2195,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2173,6 +2205,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,load,load")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
@@ -2209,7 +2242,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
: "error: cache architectire unspecified");
case 2:
return (TARGET_GLn_CACHE
@@ -2217,7 +2250,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
: "error: cache architecture unspecified");
}
break;
@@ -2237,7 +2270,8 @@
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+ "flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2248,7 +2282,8 @@
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+ "global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
@@ -2257,6 +2292,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,store,store")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
@@ -2331,7 +2367,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: "error: cache architecture unspecified");
case 2:
@@ -2344,7 +2380,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: "error: cache architecture unspecified");
@@ -2366,7 +2402,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2379,7 +2415,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
@@ -2389,6 +2425,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..53e86c8 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3226,7 +3226,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 2fedbeb..c2db305 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
VECTOR_MODE (FLOAT, BF, 2); /* V2BF */
VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
@@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
-VECTOR_MODE (INT, SI, 64); /* V64SI */
INT_MODE (OI, 32);
INT_MODE (XI, 64);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index d244b225..09a35ef 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1362,7 +1362,9 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
if (arg_ok)
set_option (opts, enum_opts_set, opt, value,
- p + opt_len, DK_UNSPECIFIED, input_location,
+ p + opt_len,
+ static_cast<int> (diagnostics::kind::unspecified),
+ input_location,
global_dc);
else
{
@@ -3613,6 +3615,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
+ if (TARGET_64BIT)
+ {
+ /* Do not warn when emulating the MS ABI. */
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
+ || ix86_function_type_abi (*node) != MS_ABI)
+ warning (OPT_Wattributes, "%qE attribute ignored",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("regparm", name))
{
@@ -3625,7 +3639,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
{
- error ("regparam and thiscall attributes are not compatible");
+ error ("regparm and thiscall attributes are not compatible");
}
cst = TREE_VALUE (args);
@@ -3646,19 +3660,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
- if (TARGET_64BIT)
- {
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
+ /* Can combine fastcall with sseregparm. */
if (is_attribute_p ("fastcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3679,8 +3681,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
}
}
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
+ /* Can combine stdcall with regparm and sseregparm. */
else if (is_attribute_p ("stdcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3730,6 +3731,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
{
error ("cdecl and thiscall attributes are not compatible");
}
+ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparm and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
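
As a hedged illustration of what the added check rejects (the declaration is invented and assumes a 32-bit, i.e. -m32, compilation where thiscall is meaningful):

/* Hypothetical declaration: regparm is already on the type when thiscall is
   processed, so the new lookup reports the incompatibility in this attribute
   order too, mirroring the pre-existing check for the reverse order.  */
void __attribute__ ((regparm (2), thiscall)) callee (int arg);
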
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 49bd393..65e04d3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12442,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol;
static rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+ function with no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
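
For reference, a minimal source pattern that would now trigger the diagnostic might look like this; the names are invented, and a global-dynamic TLS model (e.g. under -fPIC) is assumed so that __tls_get_addr is actually called:

/* Hypothetical example: the TLS access below goes through __tls_get_addr,
   which does not preserve YMM/ZMM registers, so GCC now errors out and
   suggests -mtls-dialect=gnu2 instead of silently clobbering them.  */
extern __thread int counter;            /* assumed global-dynamic TLS */

__attribute__ ((no_caller_saved_registers))
int
bump (void)
{
  return ++counter;
}
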
if (!ix86_tls_symbol)
{
const char *sym
@@ -20007,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -21491,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -23132,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -24788,6 +24809,12 @@ static void map_egpr_constraints (vec<const char *> &constraints)
buf.safe_push (cur[j + 1]);
j++;
break;
+ case '{':
+ do
+ {
+ buf.safe_push (cur[j]);
+ } while (cur[j++] != '}');
+ break;
default:
buf.safe_push (cur[j]);
break;
@@ -25205,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25277,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25302,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25330,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25783,7 +25815,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -26122,32 +26154,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
+ == VMAT_GATHER_SCATTER)))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26211,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb52699..a50475b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2968,7 +2968,8 @@
(match_operand:SWI248 1 "const_int_operand"))]
"optimize_insn_for_size_p () && optimize_size > 1
&& operands[1] != const0_rtx
- && operands[1] != constm1_rtx
+ && (operands[1] != constm1_rtx
+ || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0])))
&& IN_RANGE (INTVAL (operands[1]), -128, 127)
&& !ix86_red_zone_used
&& REGNO (operands[0]) != SP_REG"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d88c3d6..ec74f93 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -21729,6 +21729,19 @@
(const_string "orig")))
(set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+;; Eliminate redundancy caused by
+;; /* Special case TImode to 128-bit vector conversions via V2DI. */
+;; in ix86_expand_vector_move
+
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_concat:V2DI
+ (subreg:DI (match_operand:TI 1 "register_operand") 0)
+ (subreg:DI (match_dup 1) 8)))]
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
+ [(set (match_dup 0)
+ (subreg:V2DI (match_dup 1) 0))])
+
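
A source-level shape that can reach this split (hypothetical; whether it does depends on how ix86_expand_vector_move lowers the TImode copy) is a plain bit-copy from __int128 to a 128-bit vector:

/* Hypothetical example: the TImode-to-V2DImode conversion used to be
   expanded as two DImode halves glued by vec_concat; the split folds that
   back into a single subreg so one SSE move remains.  */
#include <emmintrin.h>
#include <string.h>

__m128i
ti_to_vec (__int128 x)
{
  __m128i v;
  memcpy (&v, &x, sizeof v);   /* bit-copy: TImode -> 128-bit vector */
  return v;
}
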
(define_insn "*vec_concatv2di_0"
[(set (match_operand:V2DI 0 "register_operand" "=v,v ,x")
(vec_concat:V2DI
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index b00fcc7..493f95e 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -11052,17 +11052,21 @@ static bool
loongarch_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
- bool is_packed)
+ bool is_packed,
+ bool is_gather_scatter)
{
if ((ISA_HAS_LSX || ISA_HAS_LASX) && STRICT_ALIGNMENT)
{
+ if (is_gather_scatter)
+ return true;
if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
return false;
if (misalignment == -1)
return false;
}
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* Return a PARALLEL containing NELTS elements, with element I equal
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index d897763..5fc8665 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -823,8 +823,6 @@ typedef struct {
#define CASE_VECTOR_MODE Pmode
-#define CASE_VECTOR_SHORTEN_MODE(MIN, MAX, BODY) Pmode
-
/* Define this as 1 if `char' should by default be signed; else as 0. */
#ifndef DEFAULT_SIGNED_CHAR
#define DEFAULT_SIGNED_CHAR 1
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index d326ca4..9796839 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -120,6 +120,51 @@ Target RejectNegative Alias(misa=,sm_89)
march-map=sm_90a
Target RejectNegative Alias(misa=,sm_89)
+march-map=sm_100
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121a
+Target RejectNegative Alias(misa=,sm_89)
+
Enum
Name(ptx_version) Type(enum ptx_version)
Known PTX ISA versions (for use with the -mptx= option):
diff --git a/gcc/config/pru/pru-pragma.cc b/gcc/config/pru/pru-pragma.cc
index c3f3d33..9338780 100644
--- a/gcc/config/pru/pru-pragma.cc
+++ b/gcc/config/pru/pru-pragma.cc
@@ -46,21 +46,24 @@ pru_pragma_ctable_entry (cpp_reader *)
enum cpp_ttype type;
type = pragma_lex (&ctable_index);
- if (type == CPP_NUMBER && tree_fits_uhwi_p (ctable_index))
+ if (type == CPP_NUMBER && tree_fits_shwi_p (ctable_index))
{
type = pragma_lex (&base_addr);
- if (type == CPP_NUMBER && tree_fits_uhwi_p (base_addr))
+ if (type == CPP_NUMBER && tree_fits_shwi_p (base_addr))
{
- unsigned HOST_WIDE_INT i = tree_to_uhwi (ctable_index);
- unsigned HOST_WIDE_INT base = tree_to_uhwi (base_addr);
+ HOST_WIDE_INT i = tree_to_shwi (ctable_index);
+ HOST_WIDE_INT base = sext_hwi (tree_to_shwi (base_addr),
+ POINTER_SIZE);
type = pragma_lex (&base_addr);
if (type != CPP_EOF)
error ("junk at end of %<#pragma CTABLE_ENTRY%>");
- else if (i >= ARRAY_SIZE (pru_ctable))
+ else if (!IN_RANGE (i, 0, ARRAY_SIZE (pru_ctable) - 1))
error ("%<CTABLE_ENTRY%> index %wd is not valid", i);
else if (pru_ctable[i].valid && pru_ctable[i].base != base)
error ("redefinition of %<CTABLE_ENTRY %wd%>", i);
+ else if (!IN_RANGE (base, INT32_MIN, INT32_MAX))
+ error ("%<CTABLE_ENTRY%> base address does not fit in 32-bits");
else
{
if (base & 0xff)
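
A usage sketch of the pragma whose operands are now parsed as signed values (index and address invented for illustration):

/* Hypothetical example: a base address with the sign bit set is now
   sign-extended to POINTER_SIZE consistently and still accepted because it
   fits in 32 bits; out-of-range indexes or bases hit the new errors.  */
#pragma ctable_entry 14 0x80000000
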
diff --git a/gcc/config/pru/pru-protos.h b/gcc/config/pru/pru-protos.h
index c73fad8..4750f0e 100644
--- a/gcc/config/pru/pru-protos.h
+++ b/gcc/config/pru/pru-protos.h
@@ -23,7 +23,7 @@
struct pru_ctable_entry {
bool valid;
- unsigned HOST_WIDE_INT base;
+ HOST_WIDE_INT base;
};
extern struct pru_ctable_entry pru_ctable[32];
@@ -66,9 +66,9 @@ pru_regno_ok_for_index_p (int regno, bool strict_p)
return pru_regno_ok_for_base_p (regno, strict_p);
}
-extern int pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr);
-extern int pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr);
-extern int pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr);
+extern int pru_get_ctable_exact_base_index (HOST_WIDE_INT caddr);
+extern int pru_get_ctable_base_index (HOST_WIDE_INT caddr);
+extern int pru_get_ctable_base_offset (HOST_WIDE_INT caddr);
extern int pru_symref2ioregno (rtx op);
diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc
index 47e5f24..322e319 100644
--- a/gcc/config/pru/pru.cc
+++ b/gcc/config/pru/pru.cc
@@ -1428,7 +1428,7 @@ pru_valid_const_ubyte_offset (machine_mode mode, HOST_WIDE_INT offset)
/* Recognize a CTABLE base address. Return CTABLE entry index, or -1 if
base was not found in the pragma-filled pru_ctable. */
int
-pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr)
+pru_get_ctable_exact_base_index (HOST_WIDE_INT caddr)
{
unsigned int i;
@@ -1444,7 +1444,7 @@ pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr)
/* Check if the given address can be addressed via CTABLE_BASE + UBYTE_OFFS,
and return the base CTABLE index if possible. */
int
-pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr)
+pru_get_ctable_base_index (HOST_WIDE_INT caddr)
{
unsigned int i;
@@ -1461,7 +1461,7 @@ pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr)
/* Return the offset from some CTABLE base for this address. */
int
-pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr)
+pru_get_ctable_base_offset (HOST_WIDE_INT caddr)
{
int i;
@@ -2004,7 +2004,7 @@ pru_print_operand_address (FILE *file, machine_mode mode, rtx op)
case CONST_INT:
{
- unsigned HOST_WIDE_INT caddr = INTVAL (op);
+ HOST_WIDE_INT caddr = INTVAL (op);
int base = pru_get_ctable_base_index (caddr);
int offs = pru_get_ctable_base_offset (caddr);
if (base < 0)
diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize
index fd55255..34dad45 100755
--- a/gcc/config/riscv/arch-canonicalize
+++ b/gcc/config/riscv/arch-canonicalize
@@ -32,7 +32,7 @@ import itertools
from functools import reduce
SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"]
-CANONICAL_ORDER = "imafdgqlcbkjtpvn"
+CANONICAL_ORDER = "imafdqlcbkjtpvnh"
LONG_EXT_PREFIXES = ['z', 's', 'h', 'x']
#
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index f372f0e..6531996 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1714,6 +1714,74 @@
}
[(set_attr "type" "vialu")])
+(define_insn_and_split "*<sat_op_v_vdup>_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+ (if_then_else:V_VLSI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 5 "vector_length_operand")
+ (match_operand 6 "const_int_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)
+ (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
+ (unspec:V_VLSI
+ [(match_operand:V_VLSI 3 "register_operand")
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 4 "reg_or_int_operand"))] VSAT_VX_OP_V_VDUP)
+ (unspec:V_VLSI
+ [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ int vxrm_val = INTVAL (operands[9]);
+ riscv_vector::expand_vx_binary_vxrm_vec_vec_dup (operands[0], operands[3],
+ operands[4],
+ <VSAT_VX_OP_V_VDUP>,
+ vxrm_val, <MODE>mode);
+
+ DONE;
+ }
+ [(set_attr "type" "vaalu")])
+
+(define_insn_and_split "*<sat_op_vdup_v>_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+ (if_then_else:V_VLSI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 5 "vector_length_operand")
+ (match_operand 6 "const_int_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)
+ (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
+ (unspec:V_VLSI
+ [(vec_duplicate:V_VLSI
+ (match_operand:<VEL> 4 "reg_or_int_operand"))
+ (match_operand:V_VLSI 3 "register_operand")] VSAT_VX_OP_VDUP_V)
+ (unspec:V_VLSI
+ [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ int vxrm_val = INTVAL (operands[9]);
+ riscv_vector::expand_vx_binary_vxrm_vec_dup_vec (operands[0], operands[3],
+ operands[4],
+ <VSAT_VX_OP_VDUP_V>,
+ vxrm_val, <MODE>mode);
+
+ DONE;
+ }
+ [(set_attr "type" "vaalu")])
+
;; =============================================================================
;; Combine vec_duplicate + op.vv to op.vf
;; Include
@@ -1838,8 +1906,58 @@
emit_insn (gen_extend<vsubel><vel>2(tmp, operands[1]));
rtx ops[] = {operands[0], tmp};
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, ops);
+ riscv_vector::expand_broadcast (<MODE>mode, ops);
+ DONE;
+ }
+ [(set_attr "type" "vfwmuladd")]
+)
+
+;; vfwnmacc.vf
+(define_insn_and_split "*vfwnmacc_vf_<mode>"
+ [(set (match_operand:VWEXTF 0 "register_operand")
+ (minus:VWEXTF
+ (mult:VWEXTF
+ (neg:VWEXTF
+ (vec_duplicate:VWEXTF
+ (float_extend:<VEL>
+ (match_operand:<VSUBEL> 2 "register_operand"))))
+ (float_extend:VWEXTF
+ (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand")))
+ (match_operand:VWEXTF 1 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ rtx ops[] = {operands[0], operands[1], operands[2], operands[3]};
+ riscv_vector::emit_vlmax_insn(
+ code_for_pred_widen_mul_neg_scalar(MINUS, <MODE>mode),
+ riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops);
+ DONE;
+ }
+ [(set_attr "type" "vfwmuladd")]
+)
+
+;; vfwnmsac.vf
+(define_insn_and_split "*vfwnmsac_vf_<mode>"
+ [(set (match_operand:VWEXTF 0 "register_operand")
+ (minus:VWEXTF
+ (match_operand:VWEXTF 1 "register_operand")
+ (mult:VWEXTF
+ (float_extend:VWEXTF
+ (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand"))
+ (vec_duplicate:VWEXTF
+ (float_extend:<VEL>
+ (match_operand:<VSUBEL> 2 "register_operand"))))))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ rtx ops[] = {operands[0], operands[1], operands[2], operands[3]};
+ riscv_vector::emit_vlmax_insn(
+ code_for_pred_widen_mul_neg_scalar (PLUS, <MODE>mode),
+ riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops);
DONE;
}
[(set_attr "type" "vfwmuladd")]
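In informal notation (operand numbers as in the patterns above, "ext" standing for float_extend; the shorthand is illustrative, not from the patch), the two combines recognize

    vfwnmacc.vf:  op0 = -(ext(op2) * ext(op3)) - op1
    vfwnmsac.vf:  op0 = op1 - ext(op2) * ext(op3)

which matches the MINUS and PLUS variants of pred_widen_mul_neg_scalar they split to.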
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 2e86826..48de5ef 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1359,9 +1359,7 @@
if (operands[2] == const0_rtx)
{
rtx ops[] = {operands[0], operands[0], operands[1]};
- riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::SCALAR_MOVE_MERGED_OP_TU,
- ops, CONST1_RTX (Pmode));
+ riscv_vector::expand_set_first_tu (<MODE>mode, ops);
}
else
{
@@ -1385,8 +1383,7 @@
VL we need for the slide. */
rtx tmp = gen_reg_rtx (<MODE>mode);
rtx ops1[] = {tmp, operands[1]};
- emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, ops1, length);
+ riscv_vector::expand_broadcast (<MODE>mode, ops1, length);
/* Slide exactly one element up leaving the tail elements
unchanged. */
@@ -2489,7 +2486,8 @@
(sign_extend:VWEXTI
(match_operand:<V_DOUBLE_TRUNC> 1 "register_operand"))
(sign_extend:VWEXTI
- (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand"))))))]
+ (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")))
+ (const_int 1))))]
"TARGET_VECTOR"
{
insn_code icode = code_for_pred (UNSPEC_VAADD, <V_DOUBLE_TRUNC>mode);
@@ -2522,7 +2520,8 @@
(match_operand:<V_DOUBLE_TRUNC> 1 "register_operand"))
(sign_extend:VWEXTI
(match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")))
- (const_int 1)))))]
+ (const_int 1))
+ (const_int 1))))]
"TARGET_VECTOR"
{
insn_code icode = code_for_pred (UNSPEC_VAADD, <V_DOUBLE_TRUNC>mode);
@@ -2532,6 +2531,19 @@
}
)
+(define_expand "avg<mode>3_ceil"
+ [(match_operand:V_VLSI_D 0 "register_operand")
+ (match_operand:V_VLSI_D 1 "register_operand")
+ (match_operand:V_VLSI_D 2 "register_operand")]
+ "TARGET_VECTOR"
+ {
+ insn_code icode = code_for_pred (UNSPEC_VAADD, <MODE>mode);
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP_VXRM_RNU,
+ operands);
+ DONE;
+ }
+)
+
;; csrwi vxrm, 2
;; vaaddu.vv vd, vs2, vs1
(define_expand "uavg<mode>3_floor"
diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
new file mode 100644
index 0000000..9681438
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
@@ -0,0 +1,43 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Core Name}");
+ puts ("");
+ puts ("@opindex mcpu");
+ puts ("@item -mcpu=@var{processor-string}");
+ puts ("Use architecture of and optimize the output for the given processor, specified");
+ puts ("by particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> coreNames;
+
+#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \
+ coreNames.push_back (CORE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_CORE
+
+ for (size_t i = 0; i < coreNames.size(); ++i) {
+ if (i == coreNames.size() - 1) {
+ printf("@samp{%s}.\n", coreNames[i].c_str());
+ } else {
+ printf("@samp{%s},\n\n", coreNames[i].c_str());
+ }
+ }
+
+ return 0;
+}
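The emitted file is a plain Texinfo fragment, roughly of the shape below (the core names are placeholders only; the real list is taken from riscv-cores.def):

    @samp{Core Name}

    @opindex mcpu
    @item -mcpu=@var{processor-string}
    Use architecture of and optimize the output for the given processor, ...

    @samp{core-a},

    @samp{core-b}.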
diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc
new file mode 100644
index 0000000..1bdfe2a
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc
@@ -0,0 +1,41 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Tune Name}");
+ puts ("");
+ puts ("@opindex mtune");
+ puts ("@item -mtune=@var{processor-string}");
+ puts ("Optimize the output for the given processor, specified by microarchitecture or");
+ puts ("particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> tuneNames;
+
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
+ tuneNames.push_back (TUNE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_TUNE
+
+ for (size_t i = 0; i < tuneNames.size(); ++i) {
+ printf("@samp{%s},\n\n", tuneNames[i].c_str());
+ }
+
+ puts ("and all valid options for @option{-mcpu=}.");
+
+ return 0;
+}
diff --git a/gcc/config/riscv/generic-vector-ooo.md b/gcc/config/riscv/generic-vector-ooo.md
index ab9e57f..773003b 100644
--- a/gcc/config/riscv/generic-vector-ooo.md
+++ b/gcc/config/riscv/generic-vector-ooo.md
@@ -17,6 +17,9 @@
;; <http://www.gnu.org/licenses/>.
;; Vector load/store
+;; The insn reservations include "generic" as we won't have an in-order
+;; generic definition for vector instructions.
+
(define_automaton "vector_ooo")
;; Separate issue queue for vector instructions.
@@ -29,119 +32,141 @@
(define_cpu_unit "vxu_ooo_multicycle" "vector_ooo")
(define_insn_reservation "vec_load" 6
- (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr"))
"vxu_ooo_issue,vxu_ooo_alu")
(define_insn_reservation "vec_store" 6
- (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector segment loads/stores.
(define_insn_reservation "vec_loadstore_seg" 10
- (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\
- vssegte,vssegts,vssegtux,vssegtox")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\
+ vssegte,vssegts,vssegtux,vssegtox"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Regular vector operations and integer comparisons.
(define_insn_reservation "vec_alu" 3
- (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\
- vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\
- vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\
+ vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\
+ vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float comparison, conversion etc.
(define_insn_reservation "vec_fcmp" 3
- (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\
- vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\
- vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\
+ vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\
+ vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector integer multiplication.
(define_insn_reservation "vec_imul" 4
- (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\
- vghsh,vgmul")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\
+ vghsh,vgmul"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float addition.
(define_insn_reservation "vec_fadd" 4
- (eq_attr "type" "vfalu,vfwalu")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfalu,vfwalu"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float multiplication and FMA.
(define_insn_reservation "vec_fmul" 6
- (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, assumed to be a generic operation for now.
(define_insn_reservation "vec_crypto" 4
- (eq_attr "type" "crypto,vclz,vctz,vcpop")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "crypto,vclz,vctz,vcpop"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, AES
(define_insn_reservation "vec_crypto_aes" 4
- (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, sha
(define_insn_reservation "vec_crypto_sha" 4
- (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, SM3/4
(define_insn_reservation "vec_crypto_sm" 4
- (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector permute.
(define_insn_reservation "vec_perm" 3
- (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\
- vislide1down,vfslide1up,vfslide1down,vgather,vcompress")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\
+ vislide1down,vfslide1up,vfslide1down,vgather,vcompress"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector reduction.
(define_insn_reservation "vec_reduction" 8
- (eq_attr "type" "vired,viwred,vfredu,vfwredu")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vired,viwred,vfredu,vfwredu"))
"vxu_ooo_issue,vxu_ooo_multicycle")
;; Vector ordered reduction, assume the latency number is for
;; a 128-bit vector. It is scaled in riscv_sched_adjust_cost
;; for larger vectors.
(define_insn_reservation "vec_ordered_reduction" 10
- (eq_attr "type" "vfredo,vfwredo")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfredo,vfwredo"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector integer division, assume not pipelined.
(define_insn_reservation "vec_idiv" 16
- (eq_attr "type" "vidiv")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vidiv"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector float divisions and sqrt, assume not pipelined.
(define_insn_reservation "vec_float_divsqrt" 16
- (eq_attr "type" "vfdiv,vfsqrt")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfdiv,vfsqrt"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector mask operations.
(define_insn_reservation "vec_mask" 2
- (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\
- vfmovvf,vfmovfv")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\
+ vfmovvf,vfmovfv"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector vsetvl.
(define_insn_reservation "vec_vesetvl" 1
- (eq_attr "type" "vsetvl,vsetvl_pre")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsetvl,vsetvl_pre"))
"vxu_ooo_issue")
;; Vector rounding mode setters, assume pipeline barrier.
(define_insn_reservation "vec_setrm" 20
- (eq_attr "type" "wrvxrm,wrfrm")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "wrvxrm,wrfrm"))
"vxu_ooo_issue,vxu_ooo_issue*3")
;; Vector read vlen/vlenb.
(define_insn_reservation "vec_readlen" 4
- (eq_attr "type" "rdvlenb,rdvl")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "rdvlenb,rdvl"))
"vxu_ooo_issue,vxu_ooo_issue")
;; Vector sf_vcp.
(define_insn_reservation "vec_sf_vcp" 2
- (eq_attr "type" "sf_vc,sf_vc_se")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "sf_vc,sf_vc_se"))
"vxu_ooo_issue")
diff --git a/gcc/config/riscv/mips-p8700.md b/gcc/config/riscv/mips-p8700.md
index ae0ea8d..fac9abb 100644
--- a/gcc/config/riscv/mips-p8700.md
+++ b/gcc/config/riscv/mips-p8700.md
@@ -163,5 +163,5 @@
vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,
vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,
vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,
- sf_vc,sf_vc_se"))
+ sf_vc,sf_vc_se,ghost"))
"mips_p8700_dummies")
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 1f9a6b5..381f96c 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -518,6 +518,10 @@
(define_predicate "vector_broadcast_mask_operand"
(ior (match_operand 0 "vector_least_significant_set_mask_operand")
+ (match_operand 0 "vector_all_trues_mask_operand")))
+
+(define_predicate "strided_broadcast_mask_operand"
+ (ior (match_operand 0 "vector_least_significant_set_mask_operand")
(ior (match_operand 0 "register_operand")
(match_operand 0 "vector_all_trues_mask_operand"))))
@@ -619,6 +623,15 @@
(define_predicate "direct_broadcast_operand"
(match_test "riscv_vector::can_be_broadcast_p (op)"))
+;; A strided broadcast is just a fallback pattern that loads from
+;; memory.
+(define_predicate "strided_broadcast_operand"
+ (match_test "riscv_vector::strided_broadcast_p (op)"))
+
+(define_predicate "any_broadcast_operand"
+ (ior (match_operand 0 "direct_broadcast_operand")
+ (match_operand 0 "strided_broadcast_operand")))
+
;; A CONST_INT operand that has exactly two bits cleared.
(define_predicate "const_nottwobits_operand"
(and (match_code "const_int")
diff --git a/gcc/config/riscv/riscv-ext.def b/gcc/config/riscv/riscv-ext.def
index 6fc6d38..09f18ad 100644
--- a/gcc/config/riscv/riscv-ext.def
+++ b/gcc/config/riscv/riscv-ext.def
@@ -80,8 +80,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{2, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 4,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -190,8 +190,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zba", "zbb", "zbs"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 1,
/* EXTRA_EXTENSION_FLAGS */ EXT_FLAG_MACRO)
DEFINE_RISCV_EXT(
@@ -216,8 +216,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 7,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -398,8 +398,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{2, 0}}),
/* FLAG_GROUP */ zi,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 11,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -464,7 +464,7 @@ DEFINE_RISCV_EXT(
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zi,
/* BITMASK_GROUP_ID */ 1,
- /* BITMASK_BIT_POSITION*/ 1,
+ /* BITMASK_BIT_POSITION*/ 8,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -476,8 +476,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zm,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 12,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -787,8 +787,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zca"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zc,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 10,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -813,8 +813,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zca", "zilsd"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zc,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 9,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index a41c4c2..539321f 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -414,8 +414,14 @@ enum insn_flags : unsigned int
/* Means INSN has VXRM operand and the value is VXRM_RNU. */
VXRM_RNU_P = 1 << 20,
+ /* Means INSN has VXRM operand and the value is VXRM_RNE. */
+ VXRM_RNE_P = 1 << 21,
+
/* Means INSN has VXRM operand and the value is VXRM_RDN. */
- VXRM_RDN_P = 1 << 21,
+ VXRM_RDN_P = 1 << 22,
+
+ /* Means INSN has VXRM operand and the value is VXRM_ROD. */
+ VXRM_ROD_P = 1 << 23,
};
enum insn_type : unsigned int
@@ -477,7 +483,9 @@ enum insn_type : unsigned int
BINARY_OP_TUMA = __MASK_OP_TUMA | BINARY_OP_P,
BINARY_OP_FRM_DYN = BINARY_OP | FRM_DYN_P,
BINARY_OP_VXRM_RNU = BINARY_OP | VXRM_RNU_P,
+ BINARY_OP_VXRM_RNE = BINARY_OP | VXRM_RNE_P,
BINARY_OP_VXRM_RDN = BINARY_OP | VXRM_RDN_P,
+ BINARY_OP_VXRM_ROD = BINARY_OP | VXRM_ROD_P,
/* Ternary operator. Always have real merge operand. */
TERNARY_OP = HAS_DEST_P | HAS_MASK_P | USE_ALL_TRUES_MASK_P | HAS_MERGE_P
@@ -672,6 +680,8 @@ void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode,
machine_mode);
void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode);
void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode);
+void expand_vx_binary_vxrm_vec_vec_dup (rtx, rtx, rtx, int, int, machine_mode);
+void expand_vx_binary_vxrm_vec_dup_vec (rtx, rtx, rtx, int, int, machine_mode);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
bool, void (*)(rtx *, rtx), enum avl_type);
@@ -695,6 +705,9 @@ bool expand_block_move (rtx, rtx, rtx, bool);
machine_mode preferred_simd_mode (scalar_mode);
machine_mode get_mask_mode (machine_mode);
void expand_vec_series (rtx, rtx, rtx, rtx = 0);
+void expand_broadcast (machine_mode, rtx *, rtx = 0);
+void expand_set_first (machine_mode, rtx *, rtx = 0);
+void expand_set_first_tu (machine_mode, rtx *, rtx = 0);
void expand_vec_init (rtx, rtx);
void expand_vec_perm (rtx, rtx, rtx, rtx);
void expand_select_vl (rtx *);
@@ -762,6 +775,7 @@ enum vlmul_type get_vlmul (rtx_insn *);
int count_regno_occurrences (rtx_insn *, unsigned int);
bool imm_avl_p (machine_mode);
bool can_be_broadcast_p (rtx);
+bool strided_broadcast_p (rtx);
bool gather_scatter_valid_offset_p (machine_mode);
HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 9080189..61c4a09 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1625,16 +1625,14 @@ expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
Otherwise, use a predicated store. */
if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl)))
{
- emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
- broadcast_ops);
+ riscv_vector::expand_broadcast (info.vmode, broadcast_ops);
emit_move_insn (dst, fill_value);
}
else
{
if (!satisfies_constraint_vl (info.avl))
info.avl = force_reg (Pmode, info.avl);
- emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
- riscv_vector::UNARY_OP, broadcast_ops, info.avl);
+ riscv_vector::expand_broadcast (info.vmode, broadcast_ops, info.avl);
machine_mode mask_mode
= riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode))
.require ();
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 242ac08..c9c8328 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -351,9 +351,12 @@ public:
add_rounding_mode_operand (FRM_RNE);
else if (m_insn_flags & VXRM_RNU_P)
add_rounding_mode_operand (VXRM_RNU);
+ else if (m_insn_flags & VXRM_RNE_P)
+ add_rounding_mode_operand (VXRM_RNE);
else if (m_insn_flags & VXRM_RDN_P)
add_rounding_mode_operand (VXRM_RDN);
-
+ else if (m_insn_flags & VXRM_ROD_P)
+ add_rounding_mode_operand (VXRM_ROD);
if (insn_data[(int) icode].n_operands != m_opno)
internal_error ("invalid number of operands for insn %s, "
@@ -1190,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target,
return false;
}
+/* Helper function to emit a vmv.v.x/vmv.v.i or one of their float variants.
+ If VL is not given a VLMAX insn will be emitted, otherwise
+ a non-VLMAX insn with length VL.
+ If the value to be broadcast is not suitable for vmv.vx
+ fall back to a vlse with zero stride. This itself has a
+ fallback if the uarch prefers not to use a strided load
+ for broadcast. */
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+ type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ UNARY_OP, ops, type, vl);
+}
+
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeping the tail elements
+   unchanged (TU).  */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[2];
+ if (!vl)
+ vl = const1_rtx;
+ if (can_be_broadcast_p (elt))
+ emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+ else
+ emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
+
static void
expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
{
@@ -1226,7 +1282,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
if (lra_in_progress)
{
rtx ops[] = {result, elt};
- emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+ expand_broadcast (mode, ops);
}
else
{
@@ -1278,8 +1334,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder)
{
dup = gen_reg_rtx (builder->new_mode ());
rtx ops[] = {dup, ele};
- emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder->new_mode (), ops);
}
else
dup = expand_vector_broadcast (builder->new_mode (), ele);
@@ -1322,8 +1377,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder)
rtx tmp1 = gen_reg_rtx (builder->mode ());
rtx dup_ops[] = {tmp1, builder->elt (0)};
- emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
- dup_ops);
+ expand_broadcast (builder->mode (), dup_ops);
for (unsigned int i = 1; i < builder->npatterns (); i++)
{
@@ -2136,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x)
}
}
+/* This is a helper for binary ops with DImode scalar operands that are
+ broadcast (like vadd.vx v1, a1).
+ Instead of having similar code for all the expanders this function
+ unifies the handling. For 64-bit targets all we do is choose
+ between the vi variant (if available) and the register variant.
+ For 32-bit targets we either create the sign-extending variant
+ of vop.vx (when the immediate fits 32 bits) or emit a vector
+ broadcast of the 64-bit register/immediate and switch to a
+   vop.vv (replacing the scalar op with the broadcast vector).  */
+
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
machine_mode vector_mode, bool has_vi_variant_p,
void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
{
machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+ /* If the scalar broadcast op fits an immediate, use the
+ vop.vi variant if there is one. */
if (has_vi_variant_p)
{
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
+ /* On a 64-bit target we can always use the vop.vx variant. */
if (TARGET_64BIT)
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2155,6 +2223,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
+  /* For 32-bit targets without a vop.vi variant, a 32-bit immediate
+     needs the sign-extending (SI -> DI) vop.vx variants.  */
if (immediate_operand (*scalar_op, Pmode))
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2164,40 +2234,29 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
- bool avoid_strided_broadcast = false;
+ /* Now we're left with a 64-bit immediate or a register.
+ We cannot use a vop.vx variant but must broadcast the value first
+ and switch to a vop.vv variant.
+ Broadcast can either be done via vlse64.v v1, reg, zero
+ or by loading one 64-bit element (vle64.v) and using a
+ broadcast vrgather.vi. This is decided when splitting
+ the strided broadcast insn. */
+ gcc_assert (!TARGET_64BIT
+ && (CONST_INT_P (*scalar_op)
+ || register_operand (*scalar_op, scalar_mode)));
+
if (CONST_INT_P (*scalar_op))
{
if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
- {
- if (strided_load_broadcast_p ())
- *scalar_op = force_const_mem (scalar_mode, *scalar_op);
- else
- avoid_strided_broadcast = true;
- }
+ *scalar_op = force_const_mem (scalar_mode, *scalar_op);
else
*scalar_op = force_reg (scalar_mode, *scalar_op);
}
rtx tmp = gen_reg_rtx (vector_mode);
- if (!avoid_strided_broadcast)
- {
- rtx ops[] = {tmp, *scalar_op};
- emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
- type, vl);
- }
- else
- {
- /* Load scalar as V1DI and broadcast via vrgather.vi. */
- rtx tmp1 = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
- scalar_mode));
- tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
-
- rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
- emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
- BINARY_OP, ops);
- }
-
+ rtx ops[] = {tmp, *scalar_op};
+ emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+ UNARY_OP, ops, type, vl);
emit_vector_func (operands, tmp);
return true;
@@ -2591,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
/* Step 1: Broadcast the first pattern. */
rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
- emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder.mode (), ops);
/* Step 2: Merge the rest iteration of pattern. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
@@ -2605,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
{
rtx ops[] = {dup, merge_mask};
- emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
- SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+ expand_set_first (GET_MODE (dup), ops);
}
else /* vmv.v.x. */
{
@@ -2614,8 +2671,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
Pmode);
- emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
- ops, vl);
+ expand_broadcast (mask_int_mode, ops, vl);
}
emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
@@ -4706,20 +4762,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
rtx m1_tmp = gen_reg_rtx (m1_mode);
rtx scalar_move_ops[] = {m1_tmp, init};
- insn_code icode = code_for_pred_broadcast (m1_mode);
if (need_mask_operand_p (insn_flags))
{
if (need_vl0_safe)
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+ expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
else
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+ expand_set_first (m1_mode, scalar_move_ops, vl_op);
}
else
- emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+ expand_set_first (m1_mode, scalar_move_ops);
rtx m1_tmp2 = gen_reg_rtx (m1_mode);
rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
+ insn_code icode;
if (need_vl0_safe)
icode = code_for_pred (unspec_for_vl0_safe, vmode);
else
@@ -5597,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
}
+static enum insn_type
+get_insn_type_by_vxrm_val (int vxrm_val)
+{
+ enum insn_type itype;
+
+ switch (vxrm_val)
+ {
+ case VXRM_RNU:
+ itype = BINARY_OP_VXRM_RNU;
+ break;
+ case VXRM_RNE:
+ itype = BINARY_OP_VXRM_RNE;
+ break;
+ case VXRM_RDN:
+ itype = BINARY_OP_VXRM_RDN;
+ break;
+ case VXRM_ROD:
+ itype = BINARY_OP_VXRM_ROD;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return itype;
+}
+
+/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x))
+   and its vxrm value.  I.e. the second op comes from the vec_duplicate,
+ and the first op is the vector reg. */
+
+void
+expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
+
+/* Expand the binary vx combine with the format like v2 = vop(vec_dup(x), v1)
+   and its vxrm value.  I.e. the first op comes from the vec_duplicate,
+   and the second op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
+
/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
Aka the second op comes from the vec_duplicate, and the first op is
the vector reg. */
@@ -5808,25 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
return count;
}
-/* Return true if the OP can be directly broadcast. */
+/* Return true if the OP can be broadcast with a
+ v[f]mv.v.[xif] instruction. */
+
bool
can_be_broadcast_p (rtx op)
{
machine_mode mode = GET_MODE (op);
- /* We don't allow RA (register allocation) reload generate
- (vec_duplicate:DI reg) in RV32 system wheras we allow
- (vec_duplicate:DI mem) in RV32 system. */
- if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
- && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
- && !satisfies_constraint_Wdm (op))
+
+ /* Zero always works and we can always put an immediate into a
+ register.
+     What's tricky is that for an immediate we don't know the mode
+     of the register it will end up in, i.e. what element size
+     we want to broadcast.  So even if the immediate is small it might
+     still end up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set, can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+ if (rtx_equal_p (op, CONST0_RTX (mode))
+ || const_int_operand (op, Xmode))
+ return true;
+
+ /* Do not accept DImode broadcasts on !TARGET_64BIT. Those
+ are handled by strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return false;
+
+ /* Non-register operands that can be forced into a register we can
+ handle. These don't need to use strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+ /* Likewise, do not accept HFmode broadcast if we don't have
+ vfmv.v.f for 16-bit registers available. */
+ if (mode == HFmode && !TARGET_ZVFH)
+ return false;
+
+ /* Same for float, just that we can always handle 64-bit doubles
+ even on !TARGET_64BIT. We have ruled out 16-bit HF already
+ above. */
+ if (FLOAT_MODE_P (mode)
+ && (memory_operand (op, mode) || CONSTANT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+ /* After excluding all the cases we cannot handle the register types
+ that remain can always be broadcast. */
+ if (register_operand (op, mode))
+ return true;
+
+ return false;
+}
+
+/* Returns true for all operands that cannot use vmv.vx, vfmv.vf,
+ vmv.s.x, or vfmv.s.f but rather need to go via memory. */
+
+bool
+strided_broadcast_p (rtx op)
+{
+ machine_mode mode = GET_MODE (op);
+ if (!memory_operand (op, mode)
+ && !register_operand (op, mode)
+ && !rtx_equal_p (op, CONST0_RTX (mode))
+ && !const_int_operand (op, mode))
return false;
- if (satisfies_constraint_K (op) || register_operand (op, mode)
- || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
- || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+     DImode elements.  */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return true;
+
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+ if (!TARGET_ZVFH && mode == HFmode)
return true;
- return can_create_pseudo_p () && nonmemory_operand (op, mode);
+ return false;
}
void
@@ -5941,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
return false;
}
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+ That's the case if we're dealing with a scalar broadcast that
+ has VL = 1. */
+
bool
splat_to_scalar_move_p (rtx *ops)
{
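Since several call sites in this patch switch to the new helpers, here is a minimal usage sketch (variable names such as vmode, elt, avl and dest are illustrative; the signatures are the ones declared in riscv-protos.h):

    /* Broadcast ELT into a fresh vector register.  Without an AVL the insn is
       VLMAX; passing an AVL emits the non-VLMAX form.  The helper picks
       vmv.v.x/vfmv.v.f when possible and falls back to a strided broadcast.  */
    rtx dest = gen_reg_rtx (vmode);
    rtx ops[] = {dest, elt};
    riscv_vector::expand_broadcast (vmode, ops);       /* VLMAX */
    riscv_vector::expand_broadcast (vmode, ops, avl);  /* AVL = avl */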
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index bf5172c..7e4d396 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -643,7 +643,8 @@ public:
return e.use_exact_insn (code_for_pred_mov (e.vector_mode ()));
case OP_TYPE_x:
case OP_TYPE_f:
- return e.use_exact_insn (code_for_pred_broadcast (e.vector_mode ()));
+ return e.use_scalar_broadcast_insn
+ (code_for_pred_broadcast (e.vector_mode ()));
default:
gcc_unreachable ();
}
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc
index 8810af0..0db7549 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4753,7 +4753,10 @@ function_expander::use_ternop_insn (bool vd_accum_p, insn_code icode)
}
/* Implement the call using instruction ICODE, with a 1:1 mapping between
- arguments and input operands. */
+ arguments and input operands.
+ There are operands that cannot be broadcast using v[f]mv. In that case
+ we switch to a strided broadcast. */
+
rtx
function_expander::use_widen_ternop_insn (insn_code icode)
{
@@ -4794,7 +4797,10 @@ function_expander::use_widen_ternop_insn (insn_code icode)
}
/* Implement the call using instruction ICODE, with a 1:1 mapping between
- arguments and input operands. */
+ arguments and input operands.
+ There are operands that cannot be broadcast using v[f]mv. In that case
+ we switch to a strided broadcast. */
+
rtx
function_expander::use_scalar_move_insn (insn_code icode)
{
@@ -4812,6 +4818,37 @@ function_expander::use_scalar_move_insn (insn_code icode)
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
+ if (!can_be_broadcast_p (m_ops[3].value))
+ icode = code_for_pred_strided_broadcast (vector_mode ());
+
+ add_input_operand (Pmode, get_tail_policy_for_pred (pred));
+ add_input_operand (Pmode, get_mask_policy_for_pred (pred));
+ add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
+ return generate_insn (icode);
+}
+
+/* Implement the call using instruction ICODE, with a 1:1 mapping between
+ arguments and input operands. */
+rtx
+function_expander::use_scalar_broadcast_insn (insn_code icode)
+{
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+
+ /* Record the offset to get the argument. */
+ int arg_offset = 0;
+ add_all_one_mask_operand (mask_mode ());
+
+ if (use_real_merge_p (pred))
+ add_input_operand (arg_offset++);
+ else
+ add_vundef_operand (mode);
+
+ for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
+ add_input_operand (argno);
+
+ if (!can_be_broadcast_p (m_ops[3].value))
+ icode = code_for_pred_strided_broadcast (vector_mode ());
+
add_input_operand (Pmode, get_tail_policy_for_pred (pred));
add_input_operand (Pmode, get_mask_policy_for_pred (pred));
add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h
index 1f2587a..86d8115 100644
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -497,6 +497,7 @@ public:
rtx use_ternop_insn (bool, insn_code);
rtx use_widen_ternop_insn (insn_code);
rtx use_scalar_move_insn (insn_code);
+ rtx use_scalar_broadcast_insn (insn_code);
rtx generate_insn (insn_code);
/* The function call expression. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 4d8170d..44ef44a 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -178,8 +178,8 @@ get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
STMT 5 (be vectorized) -- point 2
...
*/
-static void
-compute_local_program_points (
+void
+costs::compute_local_program_points (
vec_info *vinfo,
hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
@@ -274,14 +274,14 @@ loop_invariant_op_p (class loop *loop,
/* Return true if the variable should be counted into liveness. */
static bool
-variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
- bool lhs_p)
+variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info,
+ slp_tree node, tree var, bool lhs_p)
{
if (!var)
return false;
gimple *stmt = STMT_VINFO_STMT (stmt_info);
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
{
if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE
@@ -357,8 +357,8 @@ variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
The live range of SSA 1 is [1, 3] in bb 2.
The live range of SSA 2 is [0, 4] in bb 3. */
-static machine_mode
-compute_local_live_ranges (
+machine_mode
+costs::compute_local_live_ranges (
loop_vec_info loop_vinfo,
const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
@@ -388,8 +388,11 @@ compute_local_live_ranges (
unsigned int point = program_point.point;
gimple *stmt = program_point.stmt;
tree lhs = gimple_get_lhs (stmt);
- if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
- true))
+ slp_tree *node = vinfo_slp_map.get (program_point.stmt_info);
+ if (!node)
+ continue;
+ if (variable_vectorized_p (loop, program_point.stmt_info,
+ *node, lhs, true))
{
biggest_mode = get_biggest_mode (biggest_mode,
TYPE_MODE (TREE_TYPE (lhs)));
@@ -397,7 +400,7 @@ compute_local_live_ranges (
pair &live_range
= live_ranges->get_or_insert (lhs, &existed_p);
gcc_assert (!existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_first_lane_point (program_points,
program_point.stmt_info);
@@ -406,8 +409,8 @@ compute_local_live_ranges (
for (i = 0; i < gimple_num_args (stmt); i++)
{
tree var = gimple_arg (stmt, i);
- if (variable_vectorized_p (loop, program_point.stmt_info, var,
- false))
+ if (variable_vectorized_p (loop, program_point.stmt_info,
+ *node, var, false))
{
biggest_mode
= get_biggest_mode (biggest_mode,
@@ -415,8 +418,7 @@ compute_local_live_ranges (
bool existed_p = false;
pair &live_range
= live_ranges->get_or_insert (var, &existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (
- program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_last_lane_point (program_points,
program_point.stmt_info);
@@ -597,15 +599,15 @@ get_store_value (gimple *stmt)
}
/* Return true if additional vector vars needed. */
-static bool
-need_additional_vector_vars_p (stmt_vec_info stmt_info)
+bool
+costs::need_additional_vector_vars_p (stmt_vec_info stmt_info,
+ slp_tree node)
{
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (type == load_vec_info_type || type == store_vec_info_type)
{
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
return true;
machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
@@ -657,8 +659,8 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
Then, after this function, we update SSA 1 live range in bb 2
into [2, 4] since SSA 1 is live out into bb 3. */
-static void
-update_local_live_ranges (
+void
+costs::update_local_live_ranges (
vec_info *vinfo,
hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
@@ -685,8 +687,13 @@ update_local_live_ranges (
{
gphi *phi = psi.phi ();
stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
- if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
- == undef_vec_info_type)
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ slp_tree *node = vinfo_slp_map.get (stmt_info);
+
+ if (!node)
+ continue;
+
+ if (SLP_TREE_TYPE (*node) == undef_vec_info_type)
continue;
for (j = 0; j < gimple_phi_num_args (phi); j++)
@@ -761,9 +768,12 @@ update_local_live_ranges (
if (!is_gimple_assign_or_call (gsi_stmt (si)))
continue;
stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
- if (need_additional_vector_vars_p (stmt_info))
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ slp_tree *node = vinfo_slp_map.get (stmt_info);
+ if (!node)
+ continue;
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (*node);
+ if (need_additional_vector_vars_p (stmt_info, *node))
{
/* For non-adjacent load/store STMT, we will potentially
convert it into:
@@ -816,8 +826,8 @@ update_local_live_ranges (
}
/* Compute the maximum live V_REGS. */
-static bool
-has_unexpected_spills_p (loop_vec_info loop_vinfo)
+bool
+costs::has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
/* Compute local program points.
It's a fast and effective computation. */
@@ -899,7 +909,11 @@ costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
/* Detect whether we're vectorizing for VLA and should apply the unrolling
heuristic described above m_unrolled_vls_niters. */
record_potential_vls_unrolling (loop_vinfo);
+}
+void
+costs::record_lmul_spills (loop_vec_info loop_vinfo)
+{
/* Detect whether the LOOP has unexpected spills. */
record_potential_unexpected_spills (loop_vinfo);
}
@@ -1071,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
load/store. */
static int
segment_loadstore_group_size (enum vect_cost_for_stmt kind,
- stmt_vec_info stmt_info)
+ stmt_vec_info stmt_info, slp_tree node)
{
if (stmt_info
&& (kind == vector_load || kind == vector_store)
@@ -1079,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
if (stmt_info
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -1093,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
unsigned
costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
stmt_vec_info stmt_info,
- slp_tree, tree vectype, int stmt_cost)
+ slp_tree node, tree vectype, int stmt_cost)
{
const cpu_vector_cost *costs = get_vector_costs ();
switch (kind)
@@ -1116,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
each vector in the group. Here we additionally add permute
costs for each. */
/* TODO: Indexed and ordered/unordered cost. */
- int group_size = segment_loadstore_group_size (kind, stmt_info);
+ int group_size = segment_loadstore_group_size (kind, stmt_info,
+ node);
if (group_size > 1)
{
switch (group_size)
@@ -1239,8 +1254,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
int stmt_cost
= targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);
+ if (stmt_info && node)
+ vinfo_slp_map.put (stmt_info, node);
+
/* Do one-time initialization based on the vinfo. */
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+
if (!m_analyzed_vinfo)
{
if (loop_vinfo)
@@ -1326,6 +1345,8 @@ costs::finish_cost (const vector_costs *scalar_costs)
{
if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
{
+ record_lmul_spills (loop_vinfo);
+
adjust_vect_cost_per_loop (loop_vinfo);
}
vector_costs::finish_cost (scalar_costs);
diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h
index de546a6..b84ceb1 100644
--- a/gcc/config/riscv/riscv-vector-costs.h
+++ b/gcc/config/riscv/riscv-vector-costs.h
@@ -91,7 +91,10 @@ private:
typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
hash_set <tree_pair_hash> memrefs;
+ hash_map <stmt_vec_info, slp_tree> vinfo_slp_map;
+
void analyze_loop_vinfo (loop_vec_info);
+ void record_lmul_spills (loop_vec_info loop_vinfo);
void record_potential_vls_unrolling (loop_vec_info);
bool prefer_unrolled_loop () const;
@@ -103,6 +106,19 @@ private:
bool m_has_unexpected_spills_p = false;
void record_potential_unexpected_spills (loop_vec_info);
+ void compute_local_program_points (vec_info *,
+ hash_map<basic_block, vec<stmt_point>> &);
+ void update_local_live_ranges (vec_info *,
+ hash_map<basic_block, vec<stmt_point>> &,
+ hash_map<basic_block, hash_map<tree, pair>> &,
+ machine_mode *);
+ machine_mode compute_local_live_ranges
+ (loop_vec_info, const hash_map<basic_block, vec<stmt_point>> &,
+ hash_map<basic_block, hash_map<tree, pair>> &);
+
+ bool has_unexpected_spills_p (loop_vec_info);
+ bool need_additional_vector_vars_p (stmt_vec_info, slp_tree);
+
void adjust_vect_cost_per_loop (loop_vec_info);
unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind,
loop_vec_info,
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 1275b03..0a9fcef 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -170,7 +170,7 @@ struct GTY(()) riscv_frame_info {
};
enum riscv_privilege_levels {
- UNKNOWN_MODE, USER_MODE, SUPERVISOR_MODE, MACHINE_MODE
+ UNKNOWN_MODE, SUPERVISOR_MODE, MACHINE_MODE, RNMI_MODE
};
struct GTY(()) mode_switching_info {
@@ -3967,13 +3967,27 @@ get_vector_binary_rtx_cost (rtx x, int scalar2vr_cost)
{
gcc_assert (riscv_v_ext_mode_p (GET_MODE (x)));
- rtx op_0 = XEXP (x, 0);
- rtx op_1 = XEXP (x, 1);
+ rtx neg;
+ rtx op_0;
+ rtx op_1;
+
+ if (GET_CODE (x) == UNSPEC)
+ {
+ op_0 = XVECEXP (x, 0, 0);
+ op_1 = XVECEXP (x, 0, 1);
+ }
+ else
+ {
+ op_0 = XEXP (x, 0);
+ op_1 = XEXP (x, 1);
+ }
if (GET_CODE (op_0) == VEC_DUPLICATE
|| GET_CODE (op_1) == VEC_DUPLICATE)
return (scalar2vr_cost + 1) * COSTS_N_INSNS (1);
- else if (GET_CODE (op_0) == NEG && GET_CODE (op_1) == VEC_DUPLICATE)
+ else if (GET_CODE (neg = op_0) == NEG
+ && (GET_CODE (op_1) == VEC_DUPLICATE
+ || GET_CODE (XEXP (neg, 0)) == VEC_DUPLICATE))
return (scalar2vr_cost + 1) * COSTS_N_INSNS (1);
else
return COSTS_N_INSNS (1);
@@ -4021,6 +4035,21 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
case SS_MINUS:
*total = get_vector_binary_rtx_cost (op, scalar2vr_cost);
break;
+ case UNSPEC:
+ {
+ switch (XINT (op, 1))
+ {
+ case UNSPEC_VAADDU:
+ case UNSPEC_VAADD:
+ *total
+ = get_vector_binary_rtx_cost (op, scalar2vr_cost);
+ break;
+ default:
+ *total = COSTS_N_INSNS (1);
+ break;
+ }
+ }
+ break;
default:
*total = COSTS_N_INSNS (1);
break;
@@ -6896,12 +6925,18 @@ riscv_handle_type_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args,
}
string = TREE_STRING_POINTER (cst);
- if (strcmp (string, "user") && strcmp (string, "supervisor")
- && strcmp (string, "machine"))
+ if (!strcmp (string, "rnmi") && !TARGET_SMRNMI)
+ {
+ error ("attribute 'rnmi' requires the Smrnmi ISA extension");
+ *no_add_attrs = true;
+ }
+ else if (strcmp (string, "supervisor")
+ && strcmp (string, "machine")
+ && strcmp (string, "rnmi"))
{
warning (OPT_Wattributes,
- "argument to %qE attribute is not %<\"user\"%>, %<\"supervisor\"%>, "
- "or %<\"machine\"%>", name);
+ "argument to %qE attribute is not %<\"supervisor\"%>, "
+ "%<\"machine\"%>, or %<\"rnmi\"%>", name);
*no_add_attrs = true;
}
}
@@ -9049,7 +9084,7 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
/* We want the CFA independent of the stack pointer for the
duration of the loop. */
add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, temp1,
+ plus_constant (Pmode, temp2,
initial_cfa_offset + rounded_size));
RTX_FRAME_RELATED_P (insn) = 1;
}
@@ -9682,12 +9717,12 @@ riscv_expand_epilogue (int style)
if (th_int_mask && TH_INT_INTERRUPT (cfun))
emit_jump_insn (gen_th_int_pop ());
- else if (mode == MACHINE_MODE)
- emit_jump_insn (gen_riscv_mret ());
else if (mode == SUPERVISOR_MODE)
emit_jump_insn (gen_riscv_sret ());
- else
- emit_jump_insn (gen_riscv_uret ());
+ else if (mode == RNMI_MODE)
+ emit_jump_insn (gen_riscv_mnret ());
+ else /* Must be MACHINE_MODE. */
+ emit_jump_insn (gen_riscv_mret ());
}
else if (style != SIBCALL_RETURN)
{
@@ -10359,10 +10394,10 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
bool sched1 = can_create_pseudo_p ();
- unsigned int prev_dest_regno = (REG_P (SET_DEST (prev_set))
+ unsigned int prev_dest_regno = (prev_set && REG_P (SET_DEST (prev_set))
? REGNO (SET_DEST (prev_set))
: FIRST_PSEUDO_REGISTER);
- unsigned int curr_dest_regno = (REG_P (SET_DEST (curr_set))
+ unsigned int curr_dest_regno = (curr_set && REG_P (SET_DEST (curr_set))
? REGNO (SET_DEST (curr_set))
: FIRST_PSEUDO_REGISTER);
@@ -12029,10 +12064,10 @@ riscv_get_interrupt_type (tree decl)
{
const char *string = TREE_STRING_POINTER (TREE_VALUE (attr_args));
- if (!strcmp (string, "user"))
- return USER_MODE;
- else if (!strcmp (string, "supervisor"))
+ if (!strcmp (string, "supervisor"))
return SUPERVISOR_MODE;
+ else if (!strcmp (string, "rnmi"))
+ return RNMI_MODE;
else /* Must be "machine". */
return MACHINE_MODE;
}
@@ -12649,14 +12684,31 @@ riscv_estimated_poly_value (poly_int64 val,
/* Return true if the vector misalignment factor is supported by the
target. */
bool
-riscv_support_vector_misalignment (machine_mode mode,
- const_tree type ATTRIBUTE_UNUSED,
- int misalignment,
- bool is_packed ATTRIBUTE_UNUSED)
+riscv_support_vector_misalignment (machine_mode mode, const_tree type,
+ int misalignment, bool is_packed,
+ bool is_gather_scatter)
{
- /* Depend on movmisalign pattern. */
+ /* IS_PACKED is true if the corresponding scalar element is not naturally
+     aligned.  If the misalignment is unknown and the access is packed
+ we defer to the default hook which will check if movmisalign is present.
+ Movmisalign, in turn, depends on TARGET_VECTOR_MISALIGN_SUPPORTED. */
+ if (misalignment == DR_MISALIGNMENT_UNKNOWN)
+ {
+ if (!is_packed)
+ return true;
+ }
+ else
+ {
+ /* If we know that misalignment is a multiple of the element size, we're
+ good. */
+ if (misalignment % TYPE_ALIGN_UNIT (type) == 0)
+ return true;
+ }
+
+ /* Otherwise fall back to movmisalign again. */
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
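As a concrete reading of the new misalignment hook (element size and offsets purely illustrative):

    /* 4-byte elements:
       known misalignment 8 -> 8 % 4 == 0 -> supported directly
       known misalignment 2 -> 2 % 4 != 0 -> defer to the movmisalign check
       unknown misalignment -> accepted when the access is not packed,
                               otherwise deferred to the default hook.  */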
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index c3b504d..578dd43 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -120,7 +120,7 @@
;; Interrupt handler instructions.
UNSPECV_MRET
UNSPECV_SRET
- UNSPECV_URET
+ UNSPECV_MNRET
;; Blockage and synchronization.
UNSPECV_BLOCKAGE
@@ -4166,11 +4166,11 @@
"sret"
[(set_attr "type" "ret")])
-(define_insn "riscv_uret"
+(define_insn "riscv_mnret"
[(return)
- (unspec_volatile [(const_int 0)] UNSPECV_URET)]
- ""
- "uret"
+ (unspec_volatile [(const_int 0)] UNSPECV_MNRET)]
+ "TARGET_SMRNMI"
+ "mnret"
[(set_attr "type" "ret")])
(define_insn "stack_tie<mode>"
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 7aac56a..a7eaa8b 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext)
$(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi
$(STAMP) s-riscv-ext.texi
-# Run `riscv-regen' after you changed or added anything from riscv-ext*.def
+RISCV_CORES_DEFS = \
+ $(srcdir)/config/riscv/riscv-cores.def
+
+build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true
+
+$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true
+
+s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi
+ $(STAMP) s-riscv-mtune.texi
+
+s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi
+ $(STAMP) s-riscv-mcpu.texi
+
+# Run `riscv-regen' after you changed or added anything from riscv-ext*.def and riscv-cores*.def
.PHONY: riscv-regen
-riscv-regen: s-riscv-ext.texi s-riscv-ext.opt
+riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi
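With these rules in place, regenerating the option documentation should, assuming a configured build tree, amount to running the phony target from the gcc build directory, e.g.:

    make riscv-regen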
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index 5f6cc42..aa3b6fb 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4013,6 +4013,14 @@
UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL
UNSPEC_VSSRL UNSPEC_VSSRA])
+(define_int_iterator VSAT_VX_OP_V_VDUP [
+ UNSPEC_VAADDU UNSPEC_VAADD
+])
+
+(define_int_iterator VSAT_VX_OP_VDUP_V [
+ UNSPEC_VAADDU UNSPEC_VAADD
+])
+
(define_int_iterator VSAT_ARITH_OP [UNSPEC_VAADDU UNSPEC_VAADD
UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL])
(define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL UNSPEC_VSSRA])
@@ -4047,6 +4055,14 @@
(UNSPEC_VSSRA "vsshift") (UNSPEC_VNCLIP "vnclip")
(UNSPEC_VNCLIPU "vnclip")])
+(define_int_attr sat_op_v_vdup [
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
+])
+
+(define_int_attr sat_op_vdup_v [
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
+])
+
(define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") (UNSPEC_VMSOF "sof")
(UNSPEC_VFRSQRT7 "rsqrt7")])
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index baf215b..66b7670 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1551,20 +1551,44 @@
(define_expand "vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR"
{
- /* Early expand DImode broadcast in RV32 system to avoid RA reload
- generate (set (reg) (vec_duplicate:DI)). */
+ /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form.
+ Otherwise combine or late combine could end up doing
+ "64-bit broadcast" (!= vmv.v.x)
+ + vadd.vv
+ = vadd.vx
+ which would be invalid. */
bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode));
if (!FLOAT_MODE_P (<VEL>mode) && gt_p)
{
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, operands);
- DONE;
+ riscv_vector::emit_vlmax_insn
+ (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP, operands);
+ DONE;
}
- /* Otherwise, allow it fall into general vec_duplicate pattern
- which allow us to have vv->vx combine optimization in later pass. */
+
+ /* Even though we can eventually broadcast any permissible
+     constant by moving it into a register, we need to force
+     any non-immediate one into a register here.
+     If we didn't do that, we couldn't fwprop/late-combine
+ vec_duplicate 123.45f
+ + vfadd.vv
+ = vfadd.vf
+ because the constant is valid for vec_duplicate but not
+ for vfadd.vf. Therefore we need to do
+ fa0 = 123.45f
+ vec_duplicate fa0
+ + vfadd.vv
+ = vfadd.vf */
+ if (!satisfies_constraint_P (operands[1])
+ && !satisfies_constraint_J (operands[1])
+ && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode))
+ && !memory_operand (operands[1], <VEL>mode))
+ operands[1] = force_reg (<VEL>mode, operands[1]);
+
+ /* Otherwise keep the vec_duplicate pattern until split. */
})
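The comment in the expander above describes why a non-immediate floating-point constant has to be forced into a scalar register before the vec_duplicate is kept around. A user-level illustration of the intended end result, assuming the loop is auto-vectorized for RVV (illustrative function, not part of the patch):

/* With the constant living in a scalar FP register, the broadcast plus
   vfadd.vv is expected to fuse into vfadd.vf.  */
void
add_const (float *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] += 123.45f;   /* vec_duplicate 123.45f + vfadd.vv -> vfadd.vf */
}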
;; According to GCC internal:
@@ -1574,28 +1598,20 @@
(define_insn_and_split "*vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR && can_create_pseudo_p ()"
"#"
"&& 1"
[(const_int 0)]
{
- if (!strided_load_broadcast_p ()
- && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
- {
- /* For Float16, reinterpret as HImode, broadcast and reinterpret
- back. */
- poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
- machine_mode vmodehi
- = riscv_vector::get_vector_mode (HImode, nunits).require ();
- rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
- lowpart_subreg (HImode, operands[1], HFmode)};
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
- riscv_vector::UNARY_OP, ops);
- }
- else
+ if (riscv_vector::can_be_broadcast_p (operands[1]))
riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
riscv_vector::UNARY_OP, operands);
+ else
+ riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP,
+ operands);
+
DONE;
}
[(set_attr "type" "vector")]
@@ -2141,69 +2157,45 @@
(match_operand:V_VLS 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
- /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f
- has better chances to do vsetvl fusion in vsetvl pass. */
bool wrap_vec_dup = true;
rtx vec_cst = NULL_RTX;
- if (riscv_vector::splat_to_scalar_move_p (operands))
- {
- operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- else if (immediate_operand (operands[3], <VEL>mode)
- && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
- && (/* -> pred_broadcast<mode>_zero */
- (vector_least_significant_set_mask_operand (operands[1],
- <VM>mode)
- && vector_const_0_operand (vec_cst, <MODE>mode))
- || (/* pred_broadcast<mode>_imm */
- vector_all_trues_mask_operand (operands[1], <VM>mode)
- && vector_const_int_or_double_0_operand (vec_cst,
- <MODE>mode))))
+ if (immediate_operand (operands[3], <VEL>mode)
+ && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
+ && (/* -> pred_broadcast<mode>_zero */
+ (vector_least_significant_set_mask_operand (operands[1],
+ <VM>mode)
+ && vector_const_0_operand (vec_cst, <MODE>mode))
+ || (/* pred_broadcast<mode>_imm */
+ vector_all_trues_mask_operand (operands[1], <VM>mode)
+ && vector_const_int_or_double_0_operand (vec_cst,
+ <MODE>mode))))
{
operands[3] = vec_cst;
wrap_vec_dup = false;
}
- /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */
- else if (satisfies_constraint_Wdm (operands[3]))
- {
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */
- if (satisfies_constraint_vu (operands[2]))
- operands[1] = CONSTM1_RTX (<VM>mode);
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
- {
- /* Case 2: vmv.s.x (TU, x == memory) ==>
- vl = 0 or 1; + vlse.v (TU) in RV32 system */
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
- else
- /* Case 3: load x (memory) to register. */
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- }
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
- && (immediate_operand (operands[3], Pmode)
+ else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD
+ && satisfies_constraint_Wb1 (operands[1])
+ && (immediate_operand (operands[3], Xmode)
|| (CONST_POLY_INT_P (operands[3])
&& known_ge (rtx_to_poly_int64 (operands[3]), 0U)
- && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
+ && known_le (rtx_to_poly_int64 (operands[3]),
+ GET_MODE_SIZE (<MODE>mode)))))
{
rtx tmp = gen_reg_rtx (Pmode);
poly_int64 value = rtx_to_poly_int64 (operands[3]);
- emit_move_insn (tmp, gen_int_mode (value, Pmode));
+ emit_move_insn (tmp, gen_int_mode (value, Xmode));
operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp);
}
- /* Never load (const_int 0) into a register, that's silly. */
- else if (operands[3] == CONST0_RTX (<VEL>mode))
+
+  /* For a vmv.v.x, never load (const_int 0) or valid immediate operands
+ into a register, because we can use vmv.v.i. */
+ else if (satisfies_constraint_Wc1 (operands[1])
+ && (satisfies_constraint_P (operands[3])
+ || operands[3] == CONST0_RTX (<VEL>mode)))
;
- /* If we're broadcasting [-16..15] across more than just
- element 0, then we can use vmv.v.i directly, thus avoiding
- the load of the constant into a GPR. */
- else if (CONST_INT_P (operands[3])
- && IN_RANGE (INTVAL (operands[3]), -16, 15)
- && !satisfies_constraint_Wb1 (operands[1]))
+ /* For vmv.s.x we have vmv.s.x v1, zero. */
+ else if (satisfies_constraint_Wb1 (operands[1])
+ && operands[3] == CONST0_RTX (<VEL>mode))
;
else
operands[3] = force_reg (<VEL>mode, operands[3]);
@@ -2211,131 +2203,68 @@
operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]);
})
-(define_insn_and_split "*pred_broadcast<mode>"
- [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vd, vd, vr, vr, vr, vr")
+(define_insn_and_rewrite "*pred_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1")
- (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSI
- (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ"))
- (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " rP, rP, rJ, rJ"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vmv.v.%o3\t%0,%3
vmv.v.%o3\t%0,%3
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero
vmv.s.x\t%0,%z3
vmv.s.x\t%0,%z3"
- "(register_operand (operands[3], <VEL>mode)
- || CONST_POLY_INT_P (operands[3]))
- && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
- [(const_int 0)]
- {
- gcc_assert (can_create_pseudo_p ());
- if (CONST_POLY_INT_P (operands[3]))
- {
- rtx tmp = gen_reg_rtx (<VEL>mode);
- emit_move_insn (tmp, operands[3]);
- operands[3] = tmp;
- }
-
- /* For SEW = 64 in RV32 system, we expand vmv.s.x:
- andi a2,a2,1
- vsetvl zero,a2,e64
- vlse64.v */
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
-
- /* If the target doesn't want a strided-load broadcast we go with a regular
- V1DImode load and a broadcast gather. */
- if (strided_load_broadcast_p ())
- {
- rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
- GET_MODE_ALIGNMENT (<VEL>mode));
- mem = validize_mem (mem);
- emit_move_insn (mem, operands[3]);
- mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
-
- emit_insn
- (gen_pred_broadcast<mode>
- (operands[0], operands[1], operands[2], mem,
- operands[4], operands[5], operands[6], operands[7]));
- }
- else
- {
- rtx tmp = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
- <VEL>mode));
- tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
-
- emit_insn
- (gen_pred_gather<mode>_scalar
- (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
- operands[4], operands[5], operands[6], operands[7]));
- }
- DONE;
- }
- [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vmv.s.x. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
+ [(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfh"
- [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
+(define_insn_and_rewrite "pred_broadcast<mode>_zvfh"
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSF
- (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
- (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vfmv.v.f\t%0,%3
vfmv.v.f\t%0,%3
vfmv.s.f\t%0,%3
vfmv.s.f\t%0,%3"
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vfmv.s.f. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
[(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfhmin"
- [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
- (if_then_else:V_VLSF_ZVFHMIN
- (unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
- (reg:SI VL_REGNUM)
- (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (vec_duplicate:V_VLSF_ZVFHMIN
- (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A"))
- (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
- "TARGET_VECTOR && strided_load_broadcast_p ()"
- "@
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero"
- [(set_attr "type" "vlds,vlds,vlds,vlds")
- (set_attr "mode" "<MODE>")])
-
(define_insn "*pred_broadcast<mode>_extended_scalar"
[(set (match_operand:V_VLSI_D 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI_D
@@ -2398,6 +2327,117 @@
[(set_attr "type" "vimov,vimov")
(set_attr "mode" "<MODE>")])
+(define_expand "@pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLS 0 "register_operand")
+ (if_then_else:V_VLS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand")
+ (match_operand 4 "vector_length_operand")
+ (match_operand 5 "const_int_operand")
+ (match_operand 6 "const_int_operand")
+ (match_operand 7 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLS
+ (match_operand:<VEL> 3 "strided_broadcast_operand"))
+ (match_operand:V_VLS 2 "vector_merge_operand")))]
+ "TARGET_VECTOR"
+{
+ if (satisfies_constraint_Wb1 (operands[1]))
+ {
+ /* If we're asked to set a single element (like vmv.s.x but we
+       need to go via memory here) and the tail policy is agnostic,
+       we can overwrite all elements.
+ Thus, set the mask to broadcast. */
+ operands[1] = CONSTM1_RTX (<VM>mode);
+ if (!satisfies_constraint_vu (operands[2])
+ && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD)
+ {
+ /* Case 2: vmv.s.x (TU, x == memory) ==>
+ vl = 0 or 1; + vlse.v (TU) in RV32 system */
+ /* In this case we must not overwrite the residual elements,
+ so set the vector length to 0/1. */
+ operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
+ }
+ }
+})
+
+(define_insn_and_split "*pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p () && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ rtx tmp = gen_reg_rtx (V1DImode);
+ emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3]));
+ tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+ emit_insn
+ (gen_pred_gather<mode>_scalar
+ (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+ operands[4], operands[5], operands[6], operands[7]));
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin"
+ [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
+ (if_then_else:V_VLSF_ZVFHMIN
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
+ (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSF_ZVFHMIN
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p ()
+ && <VEL>mode == HFmode
+ && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+ machine_mode vmodehi
+ = riscv_vector::get_vector_mode (HImode, nunits).require ();
+ rtx ops[] = {gen_lowpart (vmodehi, operands[0]),
+ gen_lowpart (HImode, operands[3])};
+ riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi),
+ riscv_vector::UNARY_OP, ops,
+ (riscv_vector::avl_type) INTVAL (operands[7]),
+ operands[4]);
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
+
+
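The strided-broadcast patterns above exist mainly for element types wider than a GPR (DImode on rv32) and for HFmode without Zvfh, where a plain vmv.v.x or vfmv.v.f is not available. A user-level case that ends up on this path, assuming an rv32 target with RVV auto-vectorization (illustration only):

/* On rv32 a 64-bit scalar cannot be broadcast with vmv.v.x, so the splat
   is done either via vlse64.v with stride zero or, when strided-load
   broadcast is not wanted, via a V1DI move plus a gather with index zero.  */
void
splat64 (long long *a, long long x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = x;   /* vec_duplicate of a DImode scalar */
}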
;; -------------------------------------------------------------------------------
;; ---- Predicated Strided loads/stores
;; -------------------------------------------------------------------------------
@@ -4639,8 +4679,8 @@
;; Handle GET_MODE_INNER (mode) = DImode. We need to split them since
;; we need to deal with SEW = 64 in RV32 system.
(define_expand "@pred_<sat_op><mode>_scalar"
- [(set (match_operand:VI_D 0 "register_operand")
- (if_then_else:VI_D
+ [(set (match_operand:V_VLSI_D 0 "register_operand")
+ (if_then_else:V_VLSI_D
(unspec:<VM>
[(match_operand:<VM> 1 "vector_mask_operand")
(match_operand 5 "vector_length_operand")
@@ -4651,10 +4691,10 @@
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
- (unspec:VI_D
- [(match_operand:VI_D 3 "register_operand")
+ (unspec:V_VLSI_D
+ [(match_operand:V_VLSI_D 3 "register_operand")
(match_operand:<VEL> 4 "reg_or_int_operand")] VSAT_ARITH_OP)
- (match_operand:VI_D 2 "vector_merge_operand")))]
+ (match_operand:V_VLSI_D 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
if (riscv_vector::sew64_scalar_helper (
@@ -4673,8 +4713,8 @@
})
(define_insn "*pred_<sat_op><mode>_scalar"
- [(set (match_operand:VI_D 0 "register_operand" "=vd, vr, vd, vr")
- (if_then_else:VI_D
+ [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr, vd, vr")
+ (if_then_else:V_VLSI_D
(unspec:<VM>
[(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
(match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
@@ -4685,18 +4725,18 @@
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
- (unspec:VI_D
- [(match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr")
+ (unspec:V_VLSI_D
+ [(match_operand:V_VLSI_D 3 "register_operand" " vr, vr, vr, vr")
(match_operand:<VEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ")] VSAT_ARITH_OP)
- (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:V_VLSI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"v<sat_op>.vx\t%0,%3,%z4%p1"
[(set_attr "type" "<sat_insn_type>")
(set_attr "mode" "<MODE>")])
(define_insn "*pred_<sat_op><mode>_extended_scalar"
- [(set (match_operand:VI_D 0 "register_operand" "=vd, vr, vd, vr")
- (if_then_else:VI_D
+ [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr, vd, vr")
+ (if_then_else:V_VLSI_D
(unspec:<VM>
[(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
(match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
@@ -4707,11 +4747,11 @@
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
- (unspec:VI_D
- [(match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr")
+ (unspec:V_VLSI_D
+ [(match_operand:V_VLSI_D 3 "register_operand" " vr, vr, vr, vr")
(sign_extend:<VEL>
(match_operand:<VSUBEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ"))] VSAT_ARITH_OP)
- (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:V_VLSI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR && !TARGET_64BIT"
"v<sat_op>.vx\t%0,%3,%z4%p1"
[(set_attr "type" "<sat_insn_type>")
diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md
index 5ed6bac..34b4a8f 100644
--- a/gcc/config/riscv/xiangshan.md
+++ b/gcc/config/riscv/xiangshan.md
@@ -107,7 +107,8 @@
;; they are just dummies like this one.
(define_insn_reservation "xiangshan_alu_unknown" 1
(and (eq_attr "tune" "xiangshan")
- (eq_attr "type" "zicond,min,max,minu,maxu,clz,ctz,cpop,ghost,rotate,clmul,condmove,crypto,mvpair,rdvlenb,rdvl,wrvxrm,wrfrm,rdfrm,vsetvl,vsetvl_pre,vlde,vste,vldm,vstm,vlds,vsts,vldux,vldox,vstux,vstox,vldff,vldr,vstr,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,vssegtux,vssegtox,vlsegdff,vialu,viwalu,vext,vicalu,vshift,vnshift,vicmp,viminmax,vimul,vidiv,viwmul,vimuladd,sf_vqmacc,viwmuladd,vimerge,vimov,vsalu,vaalu,vsmul,vsshift,vnclip,sf_vfnrclip,vfalu,vfwalu,vfmul,vfdiv,vfwmul,vfmuladd,vfwmuladd,vfsqrt,vfrecp,vfcmp,vfminmax,vfsgnj,vfclass,vfmerge,vfmov,vfcvtitof,vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,vfncvtftoi,vfncvtftof,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,vfmovvf,vfmovfv,vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16"))
+ (eq_attr "type" "zicond,min,max,minu,maxu,clz,ctz,cpop,ghost,rotate,clmul,condmove,crypto,mvpair,rdvlenb,rdvl,wrvxrm,wrfrm,rdfrm,vsetvl,vsetvl_pre,vlde,vste,vldm,vstm,vlds,vsts,vldux,vldox,vstux,vstox,vldff,vldr,vstr,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,vssegtux,vssegtox,vlsegdff,vialu,viwalu,vext,vicalu,vshift,vnshift,vicmp,viminmax,vimul,vidiv,viwmul,vimuladd,sf_vqmacc,viwmuladd,vimerge,vimov,vsalu,vaalu,vsmul,vsshift,vnclip,sf_vfnrclip,vfalu,vfwalu,vfmul,vfdiv,vfwmul,vfmuladd,vfwmuladd,vfsqrt,vfrecp,vfcmp,vfminmax,vfsgnj,vfclass,vfmerge,vfmov,vfcvtitof,vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,vfncvtftoi,vfncvtftof,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,vfmovvf,vfmovfv,vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,sf_vc,sf_vc_se"))
+
"xs_alu_rs")
;; ----------------------------------------------------
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 7ee26e5..764b499 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -4951,10 +4951,19 @@ static bool
rs6000_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
- bool is_packed)
+ bool is_packed,
+ bool is_gather_scatter)
{
if (TARGET_VSX)
{
+ if (is_gather_scatter)
+ {
+ if (TARGET_ALTIVEC && is_packed)
+ return false;
+ else
+ return true;
+ }
+
if (TARGET_EFFICIENT_UNALIGNED_VSX)
return true;
@@ -5165,6 +5174,7 @@ public:
protected:
void update_target_cost_per_stmt (vect_cost_for_stmt, stmt_vec_info,
+ slp_tree node,
vect_cost_model_location, unsigned int);
void density_test (loop_vec_info);
void adjust_vect_cost_per_loop (loop_vec_info);
@@ -5312,6 +5322,7 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind,
void
rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
stmt_vec_info stmt_info,
+ slp_tree node,
vect_cost_model_location where,
unsigned int orig_count)
{
@@ -5372,12 +5383,12 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
or may not need to apply. When finalizing the cost of the loop,
the extra penalty is applied when the load density heuristics
are satisfied. */
- if (kind == vec_construct && stmt_info
- && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP))
+ if (kind == vec_construct && node
+ && SLP_TREE_TYPE (node) == load_vec_info_type
+ && (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP))
{
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree vectype = SLP_TREE_VECTYPE (node);
unsigned int nunits = vect_nunits_for_cost (vectype);
/* As PR103702 shows, it's possible that vectorizer wants to do
costings for only one unit here, it's no need to do any
@@ -5406,7 +5417,7 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
unsigned
rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
- stmt_vec_info stmt_info, slp_tree,
+ stmt_vec_info stmt_info, slp_tree node,
tree vectype, int misalign,
vect_cost_model_location where)
{
@@ -5424,7 +5435,7 @@ rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
m_costs[where] += retval;
- update_target_cost_per_stmt (kind, stmt_info, where, orig_count);
+ update_target_cost_per_stmt (kind, stmt_info, node, where, orig_count);
}
return retval;
@@ -10309,15 +10320,18 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot)
/* case b. xx0..01..1xx: some of 15 x's (and some of 16 0's) are
rotated over the highest bit. */
- int pos_one = clz_hwi ((c << 16) >> 16);
- middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
- int middle_ones = clz_hwi (~(c << pos_one));
- if (middle_zeros >= 16 && middle_ones >= 33)
+ unsigned HOST_WIDE_INT uc = c;
+ int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16);
+ if (pos_one != 0)
{
- *rot = pos_one;
- return true;
+ middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
+ int middle_ones = clz_hwi (~(uc << pos_one));
+ if (middle_zeros >= 16 && middle_ones >= 33)
+ {
+ *rot = pos_one;
+ return true;
+ }
}
-
return false;
}
@@ -10434,7 +10448,8 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (lz >= HOST_BITS_PER_WIDE_INT)
return false;
- int middle_ones = clz_hwi (~(c << lz));
+ unsigned HOST_WIDE_INT uc = c;
+ int middle_ones = clz_hwi (~(uc << lz));
if (tz + lz + middle_ones >= ones
&& (tz - lz) < HOST_BITS_PER_WIDE_INT
&& tz < HOST_BITS_PER_WIDE_INT)
@@ -10468,7 +10483,7 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (!IN_RANGE (pos_first_1, 1, HOST_BITS_PER_WIDE_INT-1))
return false;
- middle_ones = clz_hwi (~c << pos_first_1);
+ middle_ones = clz_hwi ((~(unsigned HOST_WIDE_INT) c) << pos_first_1);
middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_first_1));
if (pos_first_1 < HOST_BITS_PER_WIDE_INT
&& middle_ones + middle_zeros < HOST_BITS_PER_WIDE_INT
@@ -10570,7 +10585,8 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns)
{
/* li/lis; rldicX */
unsigned HOST_WIDE_INT imm = (c | ~mask);
- imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
+ if (shift != 0)
+ imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
count_or_emit_insn (temp, GEN_INT (imm));
if (shift != 0)
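Several of the rs6000.cc hunks above avoid undefined behaviour in the constant-building helpers: left shifts of negative HOST_WIDE_INT values now go through an unsigned temporary, and the rotate in rs6000_emit_set_long_const is guarded so it never shifts by the full word width. A minimal sketch of the well-defined rotate (illustrative helper, not GCC code):

#include <cstdint>

/* Rotate IMM right by SHIFT bits without undefined behaviour: the operand
   is unsigned, and the shift amount 64 - 0 is never used.  */
static inline uint64_t
rotate_right_64 (uint64_t imm, int shift)
{
  if (shift == 0)
    return imm;
  return (imm >> shift) | (imm << (64 - shift));
}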
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 9c718ca..e31ee40 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1969,7 +1969,7 @@
[(set (match_dup 0) (plus:GPR (match_dup 1) (match_dup 3)))
(set (match_dup 0) (plus:GPR (match_dup 0) (match_dup 4)))]
{
- HOST_WIDE_INT val = INTVAL (operands[2]);
+ unsigned HOST_WIDE_INT val = UINTVAL (operands[2]);
HOST_WIDE_INT low = sext_hwi (val, 16);
HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index d760a7e..6becad1 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -128,6 +128,8 @@ extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
extern void s390_expand_vec_init (rtx, rtx);
extern rtx s390_expand_merge_perm_const (machine_mode, bool);
extern void s390_expand_merge (rtx, rtx, rtx, bool);
+extern void s390_expand_int_spaceship (rtx, rtx, rtx, rtx);
+extern void s390_expand_fp_spaceship (rtx, rtx, rtx, rtx);
extern rtx s390_build_signbit_mask (machine_mode);
extern rtx s390_return_addr_rtx (int, rtx);
extern rtx s390_back_chain_rtx (void);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index b5e636c..012b6db 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -8213,6 +8213,167 @@ s390_expand_atomic (machine_mode mode, enum rtx_code code,
NULL_RTX, 1, OPTAB_DIRECT), 1);
}
+/* Expand integer op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : 1.
+
+ Signedness is specified by op3. If op3 equals 1, then perform an unsigned
+ comparison, and if op3 equals -1, then perform a signed comparison.
+
+ For integer comparisons we strive for a sequence like
+ CR[L] ; LHI ; LOCHIL ; LOCHIH
+ where the first three instructions fit into a group. */
+
+void
+s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 == const1_rtx || op3 == constm1_rtx);
+
+ rtx cc, cond_lt, cond_gt;
+ machine_mode cc_mode;
+ machine_mode mode = GET_MODE (op1);
+
+  /* Prior to VXE3, emulate a 128-bit comparison by breaking it up into three
+     comparisons.  First test the high halves.  If they are equal, test
+     the low halves.  Finally, test for equality.  Depending on the results,
+     make use of LOCs.  */
+ if (mode == TImode && !TARGET_VXE3)
+ {
+ gcc_assert (TARGET_VX);
+ op1
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0));
+ op2
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0));
+ rtx lab = gen_label_rtx ();
+ rtx ccz = gen_rtx_REG (CCZmode, CC_REGNUM);
+      /* Compare high halves for equality.
+ VEC[L]G op1, op2 sets
+ CC1 if high(op1) < high(op2)
+ and
+ CC2 if high(op1) > high(op2). */
+ machine_mode cc_mode = op3 == const1_rtx ? CCUmode : CCSmode;
+ rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+ emit_insn (gen_rtx_SET (
+ gen_rtx_REG (cc_mode, CC_REGNUM),
+ gen_rtx_COMPARE (cc_mode,
+ gen_rtx_VEC_SELECT (DImode, op1, lane0),
+ gen_rtx_VEC_SELECT (DImode, op2, lane0))));
+ s390_emit_jump (lab, gen_rtx_NE (CCZmode, ccz, const0_rtx));
+      /* At this point we know that the high halves are equal.
+ VCHLGS op2, op1 sets CC1 if low(op1) < low(op2) */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVIHUmode, op2, op1)),
+ gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode)))));
+ emit_label (lab);
+ emit_insn (gen_rtx_SET (op0, const1_rtx));
+ emit_insn (
+ gen_movsicc (op0,
+ gen_rtx_LTU (CCUmode, gen_rtx_REG (CCUmode, CC_REGNUM),
+ const0_rtx),
+ constm1_rtx, op0));
+      /* Deal with the case where both halves are equal.  */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVEQmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVEQmode, op1, op2)),
+ gen_rtx_SET (gen_reg_rtx (V2DImode),
+ gen_rtx_EQ (V2DImode, op1, op2)))));
+ emit_insn (gen_movsicc (op0, gen_rtx_EQ (CCZmode, ccz, const0_rtx),
+ const0_rtx, op0));
+ return;
+ }
+
+ if (mode == QImode || mode == HImode)
+ {
+ rtx_code extend = op3 == const1_rtx ? ZERO_EXTEND : SIGN_EXTEND;
+ op1 = simplify_gen_unary (extend, SImode, op1, mode);
+ op1 = force_reg (SImode, op1);
+ op2 = simplify_gen_unary (extend, SImode, op2, mode);
+ op2 = force_reg (SImode, op2);
+ mode = SImode;
+ }
+
+ if (op3 == const1_rtx)
+ {
+ cc_mode = CCUmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LTU (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GTU (mode, cc, const0_rtx);
+ }
+ else
+ {
+ cc_mode = CCSmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LT (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GT (mode, cc, const0_rtx);
+ }
+
+ emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (cc_mode, op1, op2)));
+ emit_move_insn (op0, const0_rtx);
+ emit_insn (gen_movsicc (op0, cond_lt, constm1_rtx, op0));
+ emit_insn (gen_movsicc (op0, cond_gt, const1_rtx, op0));
+}
+
+/* Expand floating-point op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2.
+
+ If op3 equals const0_rtx, then we are interested in the compare only (see
+   test spaceship-fp-4.c).  Otherwise, op3 is a CONST_INT other than
+   const1_rtx and constm1_rtx, which is used to set op0 in the unordered case.
+
+ Emit a branch-only solution, i.e., let if-convert fold the branches into
+ LOCs if applicable. This has the benefit that the solution is also
+ applicable if we are only interested in the compare, i.e., if op3 equals
+ const0_rtx.
+ */
+
+void
+s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 != const1_rtx && op3 != constm1_rtx);
+
+ machine_mode mode = GET_MODE (op1);
+ machine_mode cc_mode = s390_select_ccmode (LTGT, op1, op2);
+ rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ rtx cond_unordered = gen_rtx_UNORDERED (mode, cc_reg, const0_rtx);
+ rtx cond_eq = gen_rtx_EQ (mode, cc_reg, const0_rtx);
+ rtx cond_gt = gen_rtx_GT (mode, cc_reg, const0_rtx);
+ rtx_insn *insn;
+ rtx l_unordered = gen_label_rtx ();
+ rtx l_eq = gen_label_rtx ();
+ rtx l_gt = gen_label_rtx ();
+ rtx l_end = gen_label_rtx ();
+
+ s390_emit_compare (VOIDmode, LTGT, op1, op2);
+ if (!flag_finite_math_only)
+ {
+ insn = s390_emit_jump (l_unordered, cond_unordered);
+ add_reg_br_prob_note (insn, profile_probability::very_unlikely ());
+ }
+ insn = s390_emit_jump (l_eq, cond_eq);
+ add_reg_br_prob_note (insn, profile_probability::unlikely ());
+ insn = s390_emit_jump (l_gt, cond_gt);
+ add_reg_br_prob_note (insn, profile_probability::even ());
+ emit_move_insn (op0, constm1_rtx);
+ emit_jump (l_end);
+ emit_label (l_eq);
+ emit_move_insn (op0, const0_rtx);
+ emit_jump (l_end);
+ emit_label (l_gt);
+ emit_move_insn (op0, const1_rtx);
+ if (!flag_finite_math_only)
+ {
+ emit_jump (l_end);
+ emit_label (l_unordered);
+ rtx unord_val = op3 == const0_rtx ? const2_rtx : op3;
+ emit_move_insn (op0, unord_val);
+ }
+ emit_label (l_end);
+}
+
/* This is called from dwarf2out.cc via TARGET_ASM_OUTPUT_DWARF_DTPREL.
We need to emit DTP-relative relocations. */
@@ -16874,8 +17035,9 @@ s390_valid_target_attribute_inner_p (tree args,
generate_option (opt, NULL, value, CL_TARGET, &decoded);
s390_handle_option (opts, new_opts_set, &decoded, input_location);
set_option (opts, new_opts_set, opt, value,
- p + opt_len, DK_UNSPECIFIED, input_location,
- global_dc);
+ p + opt_len,
+ static_cast<int> (diagnostics::kind::unspecified),
+ input_location, global_dc);
}
else
{
@@ -16892,8 +17054,9 @@ s390_valid_target_attribute_inner_p (tree args,
arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
if (arg_ok)
set_option (opts, new_opts_set, opt, value,
- p + opt_len, DK_UNSPECIFIED, input_location,
- global_dc);
+ p + opt_len,
+ static_cast<int> (diagnostics::kind::unspecified),
+ input_location, global_dc);
else
{
error ("attribute %<target%> argument %qs is unknown", orig_p);
@@ -17345,13 +17508,15 @@ static bool
s390_support_vector_misalignment (machine_mode mode ATTRIBUTE_UNUSED,
const_tree type ATTRIBUTE_UNUSED,
int misalignment ATTRIBUTE_UNUSED,
- bool is_packed ATTRIBUTE_UNUSED)
+ bool is_packed ATTRIBUTE_UNUSED,
+ bool is_gather_scatter ATTRIBUTE_UNUSED)
{
if (TARGET_VX)
return true;
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* The vector ABI requires vector types to be aligned on an 8 byte
@@ -17843,9 +18008,11 @@ f_constraint_p (const char *constraint)
for (size_t i = 0, c_len = strlen (constraint); i < c_len;
i += CONSTRAINT_LEN (constraint[i], constraint + i))
{
- if (constraint[i] == 'f')
+ if (constraint[i] == 'f'
+ || (constraint[i] == '{' && constraint[i + 1] == 'f'))
seen_f_p = true;
- if (constraint[i] == 'v')
+ if (constraint[i] == 'v'
+ || (constraint[i] == '{' && constraint[i + 1] == 'v'))
seen_v_p = true;
}
@@ -17935,7 +18102,8 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
continue;
bool allows_mem, allows_reg, is_inout;
bool ok = parse_output_constraint (&constraint, i, ninputs, noutputs,
- &allows_mem, &allows_reg, &is_inout);
+ &allows_mem, &allows_reg, &is_inout,
+ nullptr);
gcc_assert (ok);
if (!f_constraint_p (constraint))
/* Long double with a constraint other than "=f" - nothing to do. */
@@ -17980,7 +18148,7 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
bool allows_mem, allows_reg;
bool ok = parse_input_constraint (&constraint, i, ninputs, noutputs, 0,
constraints.address (), &allows_mem,
- &allows_reg);
+ &allows_reg, nullptr);
gcc_assert (ok);
if (!f_constraint_p (constraint))
/* Long double with a constraint other than "f" (or "=f" for inout
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1edbfde..8cc48b0 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1527,6 +1527,27 @@
operands[0] = SET_DEST (PATTERN (curr_insn));
})
+; Restrict the spaceship optabs to z13 or later, since z13 provides
+; LOAD HALFWORD IMMEDIATE ON CONDITION.
+
+(define_mode_iterator SPACESHIP_INT [(TI "TARGET_VX") DI SI HI QI])
+(define_expand "spaceship<mode>4"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SPACESHIP_INT 1 "register_operand")
+ (match_operand:SPACESHIP_INT 2 "register_operand")
+ (match_operand:SI 3 "const_int_operand")]
+ "TARGET_Z13 && TARGET_64BIT"
+ "s390_expand_int_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
+(define_mode_iterator SPACESHIP_BFP [TF DF SF])
+(define_expand "spaceship<mode>4"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SPACESHIP_BFP 1 "register_operand")
+ (match_operand:SPACESHIP_BFP 2 "register_operand")
+ (match_operand:SI 3 "const_int_operand")]
+ "TARGET_Z13 && TARGET_64BIT && TARGET_HARD_FLOAT"
+ "s390_expand_fp_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
; (TF|DF|SF|TD|DD|SD) instructions
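The new spaceship<mode>4 expanders implement the semantics described in the s390.cc comments above: -1/0/1 for ordered results and, for floating point, a distinct value for unordered operands. At the source level this corresponds to C++20's three-way comparison; the snippet below illustrates those semantics only, not the generated code.

/* C++20 illustration of the values the expanders produce.  */
#include <compare>

int
cmp_int (long a, long b)
{
  auto c = a <=> b;                       /* std::strong_ordering */
  return c < 0 ? -1 : c > 0 ? 1 : 0;
}

int
cmp_double (double a, double b)
{
  auto c = a <=> b;                       /* std::partial_ordering */
  if (c == std::partial_ordering::unordered)
    return 2;                             /* a NaN operand */
  return c < 0 ? -1 : c > 0 ? 1 : 0;
}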
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index b75cec1..d75cba4 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -601,8 +601,8 @@ constantpool_address_p (const_rtx addr)
/* Make sure the address is word aligned. */
offset = XEXP (addr, 1);
- if ((!CONST_INT_P (offset))
- || ((INTVAL (offset) & 3) != 0))
+ if (! CONST_INT_P (offset)
+ || (INTVAL (offset) & 3) != 0)
return false;
sym = XEXP (addr, 0);
@@ -611,6 +611,7 @@ constantpool_address_p (const_rtx addr)
if (SYMBOL_REF_P (sym)
&& CONSTANT_POOL_ADDRESS_P (sym))
return true;
+
return false;
}
@@ -4694,29 +4695,56 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code,
}
}
+/* Return TRUE if the specified insn corresponds to one or more L32R machine
+ instructions. */
+
static bool
xtensa_is_insn_L32R_p (const rtx_insn *insn)
{
- rtx x = PATTERN (insn);
+ rtx pat, dest, src;
+ machine_mode mode;
- if (GET_CODE (x) != SET)
+ /* RTX insns that are not "(set (reg) ...)" cannot become L32R instructions:
+ - it is permitted to apply PATTERN() to the insn without validation.
+ See insn_cost() in gcc/rtlanal.cc.
+     - register_operand() is used instead of REG() to identify things that
+ don't look like REGs but will eventually become so as well. */
+ if (GET_CODE (pat = PATTERN (insn)) != SET
+ || ! register_operand (dest = SET_DEST (pat), VOIDmode))
return false;
- x = XEXP (x, 1);
- if (MEM_P (x))
- {
- x = XEXP (x, 0);
- return (SYMBOL_REF_P (x) || CONST_INT_P (x))
- && CONSTANT_POOL_ADDRESS_P (x);
- }
-
- /* relaxed MOVI instructions, that will be converted to L32R by the
- assembler. */
- if (CONST_INT_P (x)
- && ! xtensa_simm12b (INTVAL (x)))
+ /* If the source is a reference to a literal pool entry, then the insn
+ obviously corresponds to an L32R instruction. */
+ if (constantpool_mem_p (src = SET_SRC (pat)))
return true;
- return false;
+ /* Similarly, an insn whose source is not a constant obviously does not
+ correspond to L32R. */
+ if (! CONSTANT_P (src))
+ return false;
+
+ /* If the source is a CONST_INT whose value fits into signed 12 bits, then
+ the insn corresponds to a MOVI instruction (rather than an L32R one),
+ regardless of the configuration of TARGET_CONST16 or
+ TARGET_AUTOLITPOOLS. Note that the destination register can be non-
+ SImode. */
+ if (((mode = GET_MODE (dest)) == SImode
+ || mode == HImode || mode == SFmode)
+ && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src)))
+ return false;
+
+ /* If TARGET_CONST16 is configured, constants of the remaining forms
+ correspond to pairs of CONST16 instructions, not L32R. */
+ if (TARGET_CONST16)
+ return false;
+
+ /* The last remaining form of constant is one of the following:
+ - CONST_INTs with large values
+ - floating-point constants
+ - symbolic constants
+     all of which are handled by a relaxed MOVI instruction, which is later
+ converted to an L32R instruction by the assembler. */
+ return true;
}
/* Compute a relative costs of RTL insns. This is necessary in order to
@@ -4725,7 +4753,7 @@ xtensa_is_insn_L32R_p (const rtx_insn *insn)
static int
xtensa_insn_cost (rtx_insn *insn, bool speed)
{
- if (!(recog_memoized (insn) < 0))
+ if (! (recog_memoized (insn) < 0))
{
int len = get_attr_length (insn);
@@ -4738,7 +4766,7 @@ xtensa_insn_cost (rtx_insn *insn, bool speed)
/* "L32R" may be particular slow (implementation-dependent). */
if (xtensa_is_insn_L32R_p (insn))
- return COSTS_N_INSNS (1 + xtensa_extra_l32r_costs);
+ return COSTS_N_INSNS ((1 + xtensa_extra_l32r_costs) * n);
/* Cost based on the pipeline model. */
switch (get_attr_type (insn))
@@ -4783,7 +4811,7 @@ xtensa_insn_cost (rtx_insn *insn, bool speed)
{
/* "L32R" itself plus constant in litpool. */
if (xtensa_is_insn_L32R_p (insn))
- len = 3 + 4;
+ len += (len / 3) * 4;
/* Consider fractional instruction length (for example, ".n"
short instructions or "L32R" litpool constants. */
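The length adjustment in xtensa_insn_cost above accounts for the literal-pool word that each L32R instruction loads: an insn whose pattern expands to n L32R instructions has length 3 * n, and the adjustment adds 4 bytes per L32R. A small illustration of that arithmetic (hypothetical helper, not part of the patch):

/* Each L32R opcode is 3 bytes and owns a 4-byte literal-pool entry, so
   the effective length of n L32R instructions is 7 * n.  */
static inline int
l32r_effective_length (int n_l32r)
{
  int len = 3 * n_l32r;
  len += (len / 3) * 4;   /* mirrors "len += (len / 3) * 4" above */
  return len;             /* == 7 * n_l32r */
}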
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 029be99..629dfdd 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1297,7 +1297,10 @@
std::swap (operands[0], operands[1]);
std::swap (operands[2], operands[3]);
}
-})
+}
+ [(set_attr "type" "move,move,load,load,store")
+ (set_attr "mode" "DI")
+ (set_attr "length" "6,12,6,6,6")])
(define_split
[(set (match_operand:DI 0 "register_operand")
@@ -1344,7 +1347,7 @@
%v0s32i\t%1, %0
rsr\t%0, ACCLO
wsr\t%1, ACCLO"
- [(set_attr "type" "move,move,move,load,store,store,move,move,move,move,move,load,load,store,rsr,wsr")
+ [(set_attr "type" "move,move,move,load,store,store,move,move,move,load,move,load,load,store,rsr,wsr")
(set_attr "mode" "SI")
(set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")])
@@ -1410,7 +1413,7 @@
%v0s16i\t%1, %0
rsr\t%0, ACCLO
wsr\t%1, ACCLO"
- [(set_attr "type" "move,move,move,move,move,load,load,store,rsr,wsr")
+ [(set_attr "type" "move,move,move,move,load,load,load,store,rsr,wsr")
(set_attr "mode" "HI")
(set_attr "length" "2,2,3,3,3,3,3,3,3,3")])
@@ -1519,7 +1522,7 @@
const16\t%0, %t1\;const16\t%0, %b1
%v1l32i\t%0, %1
%v0s32i\t%1, %0"
- [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,move,move,load,store")
+ [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,load,move,load,store")
(set_attr "mode" "SF")
(set_attr "length" "3,3,3,2,3,2,2,3,3,3,3,6,3,3")])
@@ -1643,7 +1646,10 @@
std::swap (operands[0], operands[1]);
std::swap (operands[2], operands[3]);
}
-})
+}
+ [(set_attr "type" "move,load,move,load,load,store")
+ (set_attr "mode" "DF")
+ (set_attr "length" "6,6,12,6,6,6")])
;; Block moves