aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/i386/i386-features.cc39
-rw-r--r--gcc/config/i386/i386.cc23
-rw-r--r--gcc/config/i386/i386.h4
-rw-r--r--gcc/config/i386/x86-tune-costs.h133
-rw-r--r--gcc/config/riscv/autovec-opt.md23
-rw-r--r--gcc/config/riscv/bitmanip.md74
-rw-r--r--gcc/config/riscv/predicates.md4
-rw-r--r--gcc/config/riscv/riscv-opts.h2
-rw-r--r--gcc/config/riscv/riscv-protos.h1
-rw-r--r--gcc/config/riscv/riscv-vector-costs.cc2
-rw-r--r--gcc/config/riscv/riscv.cc54
-rw-r--r--gcc/config/riscv/riscv.md20
-rw-r--r--gcc/config/riscv/riscv.opt8
-rw-r--r--gcc/config/riscv/vector-iterators.md4
14 files changed, 378 insertions, 13 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 31f3ee2..1ba5ac4 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3309,8 +3309,16 @@ ix86_get_vector_load_mode (unsigned int size)
mode = V64QImode;
else if (size == 32)
mode = V32QImode;
- else
+ else if (size == 16)
mode = V16QImode;
+ else if (size == 8)
+ mode = V8QImode;
+ else if (size == 4)
+ mode = V4QImode;
+ else if (size == 2)
+ mode = V2QImode;
+ else
+ gcc_unreachable ();
return mode;
}
@@ -3338,13 +3346,36 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
if (SUBREG_P (dest) || mode == vector_mode)
replace = vector_const;
else
- replace = gen_rtx_SUBREG (mode, vector_const, 0);
+ {
+ unsigned int size = GET_MODE_SIZE (mode);
+ if (size < ix86_regmode_natural_size (mode))
+ {
+ /* If the mode size is smaller than its natural size,
+ first insert an extra move with a QI vector SUBREG
+ of the same size to avoid validate_subreg failure. */
+ machine_mode vmode = ix86_get_vector_load_mode (size);
+ rtx vreg;
+ if (mode == vmode)
+ vreg = vector_const;
+ else
+ {
+ vreg = gen_reg_rtx (vmode);
+ rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
+ rtx pat = gen_rtx_SET (vreg, vsubreg);
+ rtx_insn *vinsn = emit_insn_before (pat, insn);
+ df_insn_rescan (vinsn);
+ }
+ replace = gen_rtx_SUBREG (mode, vreg, 0);
+ }
+ else
+ replace = gen_rtx_SUBREG (mode, vector_const, 0);
+ }
- /* NB: Don't run recog_memoized here since vector SUBREG may not
- be valid. Let LRA handle vector SUBREG. */
SET_SRC (set) = replace;
/* Drop possible dead definitions. */
PATTERN (insn) = set;
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
df_insn_rescan (insn);
}
}
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f28c92a..bef95ea 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12320,6 +12320,7 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg)
static GTY(()) rtx ix86_tls_index_symbol;
+#if TARGET_WIN32_TLS
static rtx
ix86_tls_index (void)
{
@@ -12331,6 +12332,7 @@ ix86_tls_index (void)
else
return ix86_tls_index_symbol;
}
+#endif
/* Construct the SYMBOL_REF for the tls_get_addr function. */
@@ -22792,6 +22794,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
else
*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
+ case FLOAT:
+ case UNSIGNED_FLOAT:
+ if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ *total = cost->fadd;
+ else if (VECTOR_MODE_P (mode))
+ *total = ix86_vec_cost (mode, cost->cvtpi2ps);
+ else
+ *total = cost->cvtsi2ss;
+ return false;
+
+ case FIX:
+ case UNSIGNED_FIX:
+ if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ *total = cost->fadd;
+ else if (VECTOR_MODE_P (mode))
+ *total = ix86_vec_cost (mode, cost->cvtps2pi);
+ else
+ *total = cost->cvtss2si;
+ return false;
case ABS:
/* SSE requires memory load for the constant operand. It may make
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 02bf357..6a38de3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -213,6 +213,10 @@ struct processor_costs {
such as VCVTPD2PS with larger reg in ymm. */
const int vcvtps2pd512; /* cost 512bit packed FP conversions,
such as VCVTPD2PS with larger reg in zmm. */
+ const int cvtsi2ss; /* cost of CVTSI2SS instruction. */
+ const int cvtss2si; /* cost of CVT(T)SS2SI instruction. */
+ const int cvtpi2ps; /* cost of CVTPI2PS instruction. */
+ const int cvtps2pi; /* cost of CVT(T)PS2PI instruction. */
const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
/* Specify reassociation width for integer,
fp, vector integer and vector fp
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index cddcf61..6cce70a 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -134,6 +134,11 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (4), /* cost of CVTSS2SD etc. */
COSTS_N_BYTES (4), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_BYTES (6), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_BYTES (4), /* cost of CVTSI2SS instruction. */
+ COSTS_N_BYTES (4), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_BYTES (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */
+
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
ix86_size_memcpy,
ix86_size_memset,
@@ -249,6 +254,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (27), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (54), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (108), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (27), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (27), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i386_memcpy,
i386_memset,
@@ -365,6 +374,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (27), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (27), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i486_memcpy,
i486_memset,
@@ -479,6 +492,10 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
@@ -586,6 +603,10 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (5), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (10), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (20), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (5), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (5), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
@@ -708,6 +729,10 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentiumpro_memcpy,
pentiumpro_memset,
@@ -821,6 +846,10 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
geode_memcpy,
geode_memset,
@@ -937,6 +966,10 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (8), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (2), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (2), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k6_memcpy,
k6_memset,
@@ -1054,6 +1087,10 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (4), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
athlon_memcpy,
athlon_memset,
@@ -1180,6 +1217,10 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (10), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k8_memcpy,
k8_memset,
@@ -1314,6 +1355,10 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (8), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
amdfam10_memcpy,
amdfam10_memset,
@@ -1441,6 +1486,10 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver_memcpy,
bdver_memset,
@@ -1593,6 +1642,10 @@ struct processor_costs znver1_cost = {
/* Real latency is 4, but for split regs multiply cost of half op by 2. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (8), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (7), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
and it can execute 2 integer additions and 2 multiplications thus
reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
@@ -1755,6 +1808,10 @@ struct processor_costs znver2_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (7), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -1893,6 +1950,10 @@ struct processor_costs znver3_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -2034,6 +2095,10 @@ struct processor_costs znver4_cost = {
COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
/* Real latency is 6, but for split regs multiply cost of half op by 2. */
COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -2188,6 +2253,10 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
/* Zen5 can execute:
- integer ops: 6 per cycle, at most 3 multiplications.
latency 1 for additions, 3 for multiplications (pipelined)
@@ -2330,6 +2399,10 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (4), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
skylake_memcpy,
skylake_memset,
@@ -2462,6 +2535,10 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
icelake_memcpy,
icelake_memset,
@@ -2588,6 +2665,10 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
alderlake_memcpy,
alderlake_memset,
@@ -2707,6 +2788,10 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver1_memcpy,
btver1_memset,
@@ -2823,6 +2908,10 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver2_memcpy,
btver2_memset,
@@ -2938,6 +3027,10 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (20), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (17), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium4_memcpy,
pentium4_memset,
@@ -3056,6 +3149,10 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (20), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (17), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
nocona_memcpy,
nocona_memset,
@@ -3172,6 +3269,10 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (7), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (10), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
atom_memcpy,
atom_memset,
@@ -3288,6 +3389,10 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (5), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (5), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
slm_memcpy,
slm_memset,
@@ -3418,6 +3523,10 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
tremont_memcpy,
tremont_memset,
@@ -3534,6 +3643,10 @@ struct processor_costs intel_cost = {
COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (8), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (8), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (8), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
intel_memcpy,
intel_memset,
@@ -3655,6 +3768,10 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
lujiazui_memcpy,
lujiazui_memset,
@@ -3774,6 +3891,10 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
yongfeng_memcpy,
yongfeng_memset,
@@ -3893,6 +4014,10 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
shijidadao_memcpy,
shijidadao_memset,
@@ -4020,6 +4145,10 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
generic_memcpy,
generic_memset,
@@ -4152,6 +4281,10 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
core_memcpy,
core_memset,
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 0c3b0cc..7cf7e8a 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1673,3 +1673,26 @@
DONE;
}
[(set_attr "type" "vandn")])
+
+
+;; =============================================================================
+;; Combine vec_duplicate + op.vv to op.vx
+;; Include
+;; - vadd.vx
+;; =============================================================================
+(define_insn_and_split "*<optab>_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+       (any_int_binop_no_shift_vx:V_VLSI
+         (vec_duplicate:V_VLSI
+           (match_operand:<VEL> 1 "register_operand"))
+         (match_operand:V_VLSI 2 "<binop_rhs2_predicate>")))]
+  "TARGET_VECTOR && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    /* The vector source (operand 2) is passed ahead of the scalar that
+       was being broadcast (operand 1).  */
+    rtx ops[] = {operands[0], operands[2], operands[1]};
+    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
+                                   riscv_vector::BINARY_OP, ops);
+    /* The replacement insn has been emitted explicitly above; DONE stops
+       the (const_int 0) placeholder template from being generated too.  */
+    DONE;
+  }
+  [(set_attr "type" "vialu")])
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 20d03dc..95df533 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -1302,3 +1302,77 @@
}
DONE;
})
+
+;; More forms of single bit extraction. The RISC-V port does not
+;; define SHIFT_COUNT_TRUNCATED so we need forms where the bit position
+;; is masked.
+;;
+;; We could in theory use this for rv32 as well, but it probably does
+;; not occur in practice. The bit position would need to be QI/HI mode,
+;; otherwise we would not need the zero extension.
+;;
+;; One could also argue that the zero extension is redundant and should
+;; have been optimized away during RTL simplification.
+;; Here the bit position is operand 2 masked with 63 in SImode and then
+;; zero-extended to DImode.
+(define_insn "*bextdi_position_ze_masked"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extract:DI (match_operand:DI 1 "register_operand" "r")
+ (const_int 1)
+ (zero_extend:DI
+ (and:SI (match_operand:SI 2 "register_operand" "r")
+ (const_int 63)))))]
+ "TARGET_64BIT && TARGET_ZBS"
+ "bext\t%0,%1,%2"
+ [(set_attr "type" "bitmanip")])
+
+;; Same as above, but without the extraneous zero_extend.  The pattern
+;; name only shows up in RTL dumps; it is kept distinct from the
+;; zero-extended variant so the two can be told apart there.
+(define_insn "*bext<mode>_position_and_masked"
+  [(set (match_operand:X 0 "register_operand" "=r")
+        (zero_extract:X
+          (match_operand:X 1 "register_operand" "r")
+          (const_int 1)
+          (and:X (match_operand:SI 2 "register_operand" "r")
+                 (match_operand:SI 3 "bitpos_mask_operand" "n"))))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "bext\t%0,%1,%2"
+  [(set_attr "type" "bitmanip")])
+
+
+;; Single bit extraction by first shifting it into the sign bit, then
+;; shifting it down to the low bit.
+;; NOTE(review): as written the RTL computes (op1 << op2) >> mask, where
+;; mask is the 63/31 constant accepted by bitpos_mask_operand.  That
+;; appears to read bit (mask - op2) of operand 1, whereas the emitted
+;; bext is given operand 2 directly as the bit index.  Confirm that the
+;; two really agree before relying on this pattern.
+(define_insn "*bext<mode>_position_masked"
+ [(set (match_operand:X 0 "register_operand" "=r")
+ (lshiftrt:X (ashift:X (match_operand:X 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r"))
+ (match_operand:X 3 "bitpos_mask_operand" "n")))]
+ "TARGET_ZBS"
+ "bext\t%0,%1,%2"
+ [(set_attr "type" "bitmanip")])
+
+;; Single bit extraction by shifting into the low bit, but with the
+;; position formed with a subreg of a mask.
+;; NOTE(review): the RTL below shifts operand 1 left by the masked
+;; position and then logically right by the 63/31 constant, i.e. it goes
+;; through the top bit rather than shifting straight into the low bit.
+;; That appears to read bit (mask - position) while the emitted bext
+;; indexes by operand 2 itself.  Confirm the intended semantics.
+(define_insn "*bext<mode>_position_masked_subreg"
+ [(set (match_operand:X 0 "register_operand" "=r")
+ (lshiftrt:X
+ (ashift:X (match_operand:X 1 "register_operand" "r")
+ (subreg:QI
+ (and:X (match_operand:X 2 "register_operand" "r")
+ (match_operand:X 3 "bitpos_mask_operand" "n")) 0))
+ (match_operand:X 4 "bitpos_mask_operand" "n")))]
+ "TARGET_ZBS"
+ "bext\t%0,%1,%2"
+ [(set_attr "type" "bitmanip")])
+
+;; This has shown up in testing. In particular we end up with an
+;; immediate input. We can load that into a register and target
+;; one of the above bext patterns.
+;; The split first copies the immediate (operand 1) into the clobbered
+;; scratch (operand 3), then extracts one bit of the scratch at the
+;; zero-extended position in operand 2.
+;; NOTE(review): the split condition is empty, and the replacement
+;; zero_extract with a plain zero-extended QI position is not one of the
+;; patterns added alongside it; presumably it matches a bext pattern
+;; defined elsewhere in this file that carries its own target check.
+;; Confirm.
+(define_split
+ [(set (match_operand:X 0 "register_operand")
+ (and:X (lshiftrt:X (match_operand 1 "immediate_operand")
+ (match_operand:QI 2 "register_operand"))
+ (const_int 1)))
+ (clobber (match_operand:X 3 "register_operand"))]
+ ""
+ [(set (match_dup 3) (match_dup 1))
+ (set (match_dup 0) (zero_extract:X (match_dup 3)
+ (const_int 1)
+ (zero_extend:X (match_dup 2))))])
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index f26bafc..c9a638c 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -685,3 +685,7 @@
(and (match_operand 0 "register_operand")
(match_test "REGNO (op) == RETURN_ADDR_REGNUM
|| REGNO (op) == T0_REGNUM")))
+
+;; Return true if OP is a CONST_INT whose value is 63 when TARGET_64BIT
+;; and 31 otherwise.
+(define_predicate "bitpos_mask_operand"
+ (and (match_code "const_int")
+ (match_test "TARGET_64BIT ? INTVAL (op) == 63 : INTVAL (op) == 31")))
diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
index 26fe228..9766b89 100644
--- a/gcc/config/riscv/riscv-opts.h
+++ b/gcc/config/riscv/riscv-opts.h
@@ -162,4 +162,6 @@ enum riscv_tls_type {
#define TARGET_VECTOR_AUTOVEC_SEGMENT \
(TARGET_VECTOR && riscv_mautovec_segment)
+/* Initial value of the gpr2vr_cost option variable; get_gr2vr_cost treats
+   it as "no --param=gpr2vr-cost= given" and uses the tuning table.  */
+#define GPR2VR_COST_UNPROVIDED -1
+
#endif /* ! GCC_RISCV_OPTS_H */
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 2e88990..b0d5bbb 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -836,6 +836,7 @@ struct riscv_tune_info {
const struct riscv_tune_info *
riscv_parse_tune (const char *, bool);
const cpu_vector_cost *get_vector_costs ();
+int get_gr2vr_cost ();
enum
{
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 167375c..c28eecd 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1121,7 +1121,7 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
{
case scalar_to_vec:
stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
- : costs->regmove->GR2VR);
+ : get_gr2vr_cost ());
break;
case vec_to_scalar:
stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a065732..3ee88db 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3863,7 +3863,40 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
Cost Model need to be well analyzed and supported in the future. */
if (riscv_v_ext_mode_p (mode))
{
- *total = COSTS_N_INSNS (1);
+ int gr2vr_cost = get_gr2vr_cost ();
+
+ switch (outer_code)
+ {
+ case SET:
+ {
+ switch (GET_CODE (x))
+ {
+ case VEC_DUPLICATE:
+ *total = gr2vr_cost * COSTS_N_INSNS (1);
+ break;
+ case PLUS:
+ {
+ rtx op_0 = XEXP (x, 0);
+ rtx op_1 = XEXP (x, 1);
+
+ if (GET_CODE (op_0) == VEC_DUPLICATE
+ || GET_CODE (op_1) == VEC_DUPLICATE)
+ *total = (gr2vr_cost + 1) * COSTS_N_INSNS (1);
+ else
+ *total = COSTS_N_INSNS (1);
+ }
+ break;
+ default:
+ *total = COSTS_N_INSNS (1);
+ break;
+ }
+ }
+ break;
+ default:
+ *total = COSTS_N_INSNS (1);
+ break;
+ }
+
return true;
}
@@ -9690,7 +9723,7 @@ riscv_register_move_cost (machine_mode mode,
if (to == V_REGS)
{
if (from_is_gpr)
- return get_vector_costs ()->regmove->GR2VR;
+ return get_gr2vr_cost ();
else if (from_is_fpr)
return get_vector_costs ()->regmove->FR2VR;
}
@@ -12540,6 +12573,21 @@ get_vector_costs ()
return costs;
}
+/* Return the cost of moving a value from a GPR to a VR.  Use the value
+   of --param=gpr2vr-cost= when it is provided; otherwise return the
+   default regmove->GR2VR.  */
+
+int
+get_gr2vr_cost ()
+{
+  /* Cost taken from the active vector cost table.  */
+  const int table_cost = get_vector_costs ()->regmove->GR2VR;
+
+  /* A gpr2vr_cost other than the "unprovided" sentinel takes
+     precedence over the table.  */
+  return gpr2vr_cost == GPR2VR_COST_UNPROVIDED ? table_cost : gpr2vr_cost;
+}
+
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
@@ -12606,7 +12654,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
{
/* TODO: This is too pessimistic in case we can splat. */
int regmove_cost = fp ? costs->regmove->FR2VR
- : costs->regmove->GR2VR;
+ : get_gr2vr_cost ();
return (regmove_cost + common_costs->scalar_to_vec_cost)
* estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
}
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 15c89ff..259997f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3173,15 +3173,25 @@
"#"
"&& reload_completed"
[(set (match_dup 4) (lshiftrt:X (subreg:X (match_dup 2) 0) (match_dup 6)))
- (set (match_dup 4) (and:X (match_dup 4) (match_dup 7)))
+ (set (match_dup 4) (match_dup 8))
(set (pc) (if_then_else (match_op_dup 1 [(match_dup 4) (const_int 0)])
(label_ref (match_dup 0)) (pc)))]
{
- HOST_WIDE_INT mask = INTVAL (operands[3]);
- int trailing = ctz_hwi (mask);
+ HOST_WIDE_INT mask = INTVAL (operands[3]);
+ int trailing = ctz_hwi (mask);
+
+ operands[6] = GEN_INT (trailing);
+ operands[7] = GEN_INT (mask >> trailing);
- operands[6] = GEN_INT (trailing);
- operands[7] = GEN_INT (mask >> trailing);
+ /* This splits after reload, so there's little chance to clean things
+ up. Rather than emit a ton of RTL here, we can just make a new
+ operand for that RHS and use it. For the case where the AND would
+ have been redundant, we can make it a NOP move, which does get
+ cleaned up. */
+ if (operands[7] == CONSTM1_RTX (word_mode))
+ operands[8] = operands[4];
+ else
+ operands[8] = gen_rtx_AND (word_mode, operands[4], operands[7]);
}
[(set_attr "type" "branch")])
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 7515c8e..9e471be 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -466,6 +466,10 @@ Mask(XCVBI) Var(riscv_xcv_subext)
TargetVariable
int riscv_sv_subext
+Mask(SVADE) Var(riscv_sv_subext)
+
+Mask(SVADU) Var(riscv_sv_subext)
+
Mask(SVINVAL) Var(riscv_sv_subext)
Mask(SVNAPOT) Var(riscv_sv_subext)
@@ -579,6 +583,10 @@ Inline strlen calls if possible.
Target RejectNegative Joined UInteger Var(riscv_strcmp_inline_limit) Init(64)
Max number of bytes to compare as part of inlined strcmp/strncmp routines (default: 64).
+-param=gpr2vr-cost=
+Target RejectNegative Joined UInteger Var(gpr2vr_cost) Init(GPR2VR_COST_UNPROVIDED)
+Set the cost value of the rvv instruction when operate from GPR to VR.
+
Enum
Name(rvv_max_lmul) Type(enum rvv_max_lmul_enum)
The RVV possible LMUL (-mrvv-max-lmul=):
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index b4c86909..eae3340 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4041,6 +4041,10 @@
smax umax smin umin mult div udiv mod umod
])
+;; Binary integer codes iterated over by patterns written with
+;; any_int_binop_no_shift_vx; at present the list holds only plus.
+(define_code_iterator any_int_binop_no_shift_vx [
+ plus
+])
+
(define_code_iterator any_int_unop [neg not])
(define_code_iterator any_commutative_binop [plus and ior xor