diff options
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386-features.cc | 39 | ||||
-rw-r--r-- | gcc/config/i386/i386.cc | 23 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 4 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune-costs.h | 133 | ||||
-rw-r--r-- | gcc/config/riscv/autovec-opt.md | 23 | ||||
-rw-r--r-- | gcc/config/riscv/bitmanip.md | 74 | ||||
-rw-r--r-- | gcc/config/riscv/predicates.md | 4 | ||||
-rw-r--r-- | gcc/config/riscv/riscv-opts.h | 2 | ||||
-rw-r--r-- | gcc/config/riscv/riscv-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/riscv/riscv-vector-costs.cc | 2 | ||||
-rw-r--r-- | gcc/config/riscv/riscv.cc | 54 | ||||
-rw-r--r-- | gcc/config/riscv/riscv.md | 20 | ||||
-rw-r--r-- | gcc/config/riscv/riscv.opt | 8 | ||||
-rw-r--r-- | gcc/config/riscv/vector-iterators.md | 4 |
14 files changed, 378 insertions, 13 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 31f3ee2..1ba5ac4 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3309,8 +3309,16 @@ ix86_get_vector_load_mode (unsigned int size) mode = V64QImode; else if (size == 32) mode = V32QImode; - else + else if (size == 16) mode = V16QImode; + else if (size == 8) + mode = V8QImode; + else if (size == 4) + mode = V4QImode; + else if (size == 2) + mode = V2QImode; + else + gcc_unreachable (); return mode; } @@ -3338,13 +3346,36 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const, if (SUBREG_P (dest) || mode == vector_mode) replace = vector_const; else - replace = gen_rtx_SUBREG (mode, vector_const, 0); + { + unsigned int size = GET_MODE_SIZE (mode); + if (size < ix86_regmode_natural_size (mode)) + { + /* If the mode size is smaller than its natural size, + first insert an extra move with a QI vector SUBREG + of the same size to avoid validate_subreg failure. */ + machine_mode vmode = ix86_get_vector_load_mode (size); + rtx vreg; + if (mode == vmode) + vreg = vector_const; + else + { + vreg = gen_reg_rtx (vmode); + rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0); + rtx pat = gen_rtx_SET (vreg, vsubreg); + rtx_insn *vinsn = emit_insn_before (pat, insn); + df_insn_rescan (vinsn); + } + replace = gen_rtx_SUBREG (mode, vreg, 0); + } + else + replace = gen_rtx_SUBREG (mode, vector_const, 0); + } - /* NB: Don't run recog_memoized here since vector SUBREG may not - be valid. Let LRA handle vector SUBREG. */ SET_SRC (set) = replace; /* Drop possible dead definitions. */ PATTERN (insn) = set; + INSN_CODE (insn) = -1; + recog_memoized (insn); df_insn_rescan (insn); } } diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index f28c92a..bef95ea 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -12320,6 +12320,7 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg) static GTY(()) rtx ix86_tls_index_symbol; +#if TARGET_WIN32_TLS static rtx ix86_tls_index (void) { @@ -12331,6 +12332,7 @@ ix86_tls_index (void) else return ix86_tls_index_symbol; } +#endif /* Construct the SYMBOL_REF for the tls_get_addr function. */ @@ -22792,6 +22794,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, else *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode)); return false; + case FLOAT: + case UNSIGNED_FLOAT: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtpi2ps); + else + *total = cost->cvtsi2ss; + return false; + + case FIX: + case UNSIGNED_FIX: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtps2pi); + else + *total = cost->cvtss2si; + return false; case ABS: /* SSE requires memory load for the constant operand. It may make diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 02bf357..6a38de3 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -213,6 +213,10 @@ struct processor_costs { such as VCVTPD2PS with larger reg in ymm. */ const int vcvtps2pd512; /* cost 512bit packed FP conversions, such as VCVTPD2PS with larger reg in zmm. */ + const int cvtsi2ss; /* cost of CVTSI2SS instruction. */ + const int cvtss2si; /* cost of CVT(T)SS2SI instruction. */ + const int cvtpi2ps; /* cost of CVTPI2PS instruction. */ + const int cvtps2pi; /* cost of CVT(T)PS2PI instruction. */ const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp; /* Specify reassociation width for integer, fp, vector integer and vector fp diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index cddcf61..6cce70a 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -134,6 +134,11 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (4), /* cost of CVTSS2SD etc. */ COSTS_N_BYTES (4), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_BYTES (6), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_BYTES (4), /* cost of CVTSI2SS instruction. */ + COSTS_N_BYTES (4), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_BYTES (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ ix86_size_memcpy, ix86_size_memset, @@ -249,6 +254,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (27), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (54), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (108), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (27), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (27), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ i386_memcpy, i386_memset, @@ -365,6 +374,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (27), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (27), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ i486_memcpy, i486_memset, @@ -479,6 +492,10 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium_memcpy, pentium_memset, @@ -586,6 +603,10 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (5), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (10), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (20), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (5), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (5), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium_memcpy, pentium_memset, @@ -708,6 +729,10 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentiumpro_memcpy, pentiumpro_memset, @@ -821,6 +846,10 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ geode_memcpy, geode_memset, @@ -937,6 +966,10 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (8), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (2), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (2), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ k6_memcpy, k6_memset, @@ -1054,6 +1087,10 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (4), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ athlon_memcpy, athlon_memset, @@ -1180,6 +1217,10 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (10), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ k8_memcpy, k8_memset, @@ -1314,6 +1355,10 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (8), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ amdfam10_memcpy, amdfam10_memset, @@ -1441,6 +1486,10 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ bdver_memcpy, bdver_memset, @@ -1593,6 +1642,10 @@ struct processor_costs znver1_cost = { /* Real latency is 4, but for split regs multiply cost of half op by 2. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (8), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (7), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles and it can execute 2 integer additions and 2 multiplications thus reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests @@ -1755,6 +1808,10 @@ struct processor_costs znver2_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (7), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles and it can execute 2 integer additions and 2 multiplications thus reassociation may make sense up to with of 6. @@ -1893,6 +1950,10 @@ struct processor_costs znver3_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles and it can execute 2 integer additions and 2 multiplications thus reassociation may make sense up to with of 6. @@ -2034,6 +2095,10 @@ struct processor_costs znver4_cost = { COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */ /* Real latency is 6, but for split regs multiply cost of half op by 2. */ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles and it can execute 2 integer additions and 2 multiplications thus reassociation may make sense up to with of 6. @@ -2188,6 +2253,10 @@ struct processor_costs znver5_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ /* Zen5 can execute: - integer ops: 6 per cycle, at most 3 multiplications. latency 1 for additions, 3 for multiplications (pipelined) @@ -2330,6 +2399,10 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (4), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ skylake_memcpy, skylake_memset, @@ -2462,6 +2535,10 @@ struct processor_costs icelake_cost = { COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ icelake_memcpy, icelake_memset, @@ -2588,6 +2665,10 @@ struct processor_costs alderlake_cost = { COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ alderlake_memcpy, alderlake_memset, @@ -2707,6 +2788,10 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ btver1_memcpy, btver1_memset, @@ -2823,6 +2908,10 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (14), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (13), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ btver2_memcpy, btver2_memset, @@ -2938,6 +3027,10 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (20), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (17), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium4_memcpy, pentium4_memset, @@ -3056,6 +3149,10 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (20), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (17), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ nocona_memcpy, nocona_memset, @@ -3172,6 +3269,10 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (7), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (10), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ atom_memcpy, atom_memset, @@ -3288,6 +3389,10 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (5), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (5), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ slm_memcpy, slm_memset, @@ -3418,6 +3523,10 @@ struct processor_costs tremont_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ tremont_memcpy, tremont_memset, @@ -3534,6 +3643,10 @@ struct processor_costs intel_cost = { COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (8), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (8), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (8), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ intel_memcpy, intel_memset, @@ -3655,6 +3768,10 @@ struct processor_costs lujiazui_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ lujiazui_memcpy, lujiazui_memset, @@ -3774,6 +3891,10 @@ struct processor_costs yongfeng_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ yongfeng_memcpy, yongfeng_memset, @@ -3893,6 +4014,10 @@ struct processor_costs shijidadao_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (3), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ shijidadao_memcpy, shijidadao_memset, @@ -4020,6 +4145,10 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ generic_memcpy, generic_memset, @@ -4152,6 +4281,10 @@ struct processor_costs core_cost = { COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ core_memcpy, core_memset, diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md index 0c3b0cc..7cf7e8a 100644 --- a/gcc/config/riscv/autovec-opt.md +++ b/gcc/config/riscv/autovec-opt.md @@ -1673,3 +1673,26 @@ DONE; } [(set_attr "type" "vandn")]) + + +;; ============================================================================= +;; Combine vec_duplicate + op.vv to op.vx +;; Include +;; - vadd.vx +;; ============================================================================= +(define_insn_and_split "*<optab>_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (any_int_binop_no_shift_vx:V_VLSI + (vec_duplicate:V_VLSI + (match_operand:<VEL> 1 "register_operand")) + (match_operand:V_VLSI 2 "<binop_rhs2_predicate>")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + rtx ops[] = {operands[0], operands[2], operands[1]}; + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode), + riscv_vector::BINARY_OP, ops); + } + [(set_attr "type" "vialu")]) diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md index 20d03dc..95df533 100644 --- a/gcc/config/riscv/bitmanip.md +++ b/gcc/config/riscv/bitmanip.md @@ -1302,3 +1302,77 @@ } DONE; }) + +;; More forms of single bit extraction. The RISC-V port does not +;; define SHIFT_COUNT_TRUNCATED so we need forms where the bit position +;; is masked. +;; +;; We could in theory use this for rv32 as well, but it probably does +;; not occur in practice. The bit position would need to be QI/HI mode, +;; otherwise we would not need the zero extension. +;; +;; One could also argue that the zero extension is redundant and should +;; have been optimized away during RTL simplification. +(define_insn "*bextdi_position_ze_masked" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extract:DI (match_operand:DI 1 "register_operand" "r") + (const_int 1) + (zero_extend:DI + (and:SI (match_operand:SI 2 "register_operand" "r") + (const_int 63)))))] + "TARGET_64BIT && TARGET_ZBS" + "bext\t%0,%1,%2" + [(set_attr "type" "bitmanip")]) + +;; Same as above, but without the extraneous zero_extend. +(define_insn "*bextdi_position_ze_masked" + [(set (match_operand:X 0 "register_operand" "=r") + (zero_extract:X + (match_operand:X 1 "register_operand" "r") + (const_int 1) + (and:X (match_operand:SI 2 "register_operand" "r") + (match_operand:SI 3 "bitpos_mask_operand" "n"))))] + "TARGET_64BIT && TARGET_ZBS" + "bext\t%0,%1,%2" + [(set_attr "type" "bitmanip")]) + + +;; Single bit extraction by first shifting it into the sign bit, then +;; shifting it down to the low bit. +(define_insn "*bext<mode>_position_masked" + [(set (match_operand:X 0 "register_operand" "=r") + (lshiftrt:X (ashift:X (match_operand:X 1 "register_operand" "r") + (match_operand:QI 2 "register_operand" "r")) + (match_operand:X 3 "bitpos_mask_operand" "n")))] + "TARGET_ZBS" + "bext\t%0,%1,%2" + [(set_attr "type" "bitmanip")]) + +;; Single bit extraction by shifting into the low bit, but with the +;; position formed with a subreg of a mask. +(define_insn "*bext<mode>_position_masked_subreg" + [(set (match_operand:X 0 "register_operand" "=r") + (lshiftrt:X + (ashift:X (match_operand:X 1 "register_operand" "r") + (subreg:QI + (and:X (match_operand:X 2 "register_operand" "r") + (match_operand:X 3 "bitpos_mask_operand" "n")) 0)) + (match_operand:X 4 "bitpos_mask_operand" "n")))] + "TARGET_ZBS" + "bext\t%0,%1,%2" + [(set_attr "type" "bitmanip")]) + +;; This has shown up in testing. In particular we end up with an +;; immediate input. We can load that into a register and target +;; one of the above bext patterns. +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (lshiftrt:X (match_operand 1 "immediate_operand") + (match_operand:QI 2 "register_operand")) + (const_int 1))) + (clobber (match_operand:X 3 "register_operand"))] + "" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 0) (zero_extract:X (match_dup 3) + (const_int 1) + (zero_extend:X (match_dup 2))))]) diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index f26bafc..c9a638c 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -685,3 +685,7 @@ (and (match_operand 0 "register_operand") (match_test "REGNO (op) == RETURN_ADDR_REGNUM || REGNO (op) == T0_REGNUM"))) + +(define_predicate "bitpos_mask_operand" + (and (match_code "const_int") + (match_test "TARGET_64BIT ? INTVAL (op) == 63 : INTVAL (op) == 31"))) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 26fe228..9766b89 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -162,4 +162,6 @@ enum riscv_tls_type { #define TARGET_VECTOR_AUTOVEC_SEGMENT \ (TARGET_VECTOR && riscv_mautovec_segment) +#define GPR2VR_COST_UNPROVIDED -1 + #endif /* ! GCC_RISCV_OPTS_H */ diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 2e88990..b0d5bbb 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -836,6 +836,7 @@ struct riscv_tune_info { const struct riscv_tune_info * riscv_parse_tune (const char *, bool); const cpu_vector_cost *get_vector_costs (); +int get_gr2vr_cost (); enum { diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 167375c..c28eecd 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1121,7 +1121,7 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, { case scalar_to_vec: stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR - : costs->regmove->GR2VR); + : get_gr2vr_cost ()); break; case vec_to_scalar: stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index a065732..3ee88db 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -3863,7 +3863,40 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN Cost Model need to be well analyzed and supported in the future. */ if (riscv_v_ext_mode_p (mode)) { - *total = COSTS_N_INSNS (1); + int gr2vr_cost = get_gr2vr_cost (); + + switch (outer_code) + { + case SET: + { + switch (GET_CODE (x)) + { + case VEC_DUPLICATE: + *total = gr2vr_cost * COSTS_N_INSNS (1); + break; + case PLUS: + { + rtx op_0 = XEXP (x, 0); + rtx op_1 = XEXP (x, 1); + + if (GET_CODE (op_0) == VEC_DUPLICATE + || GET_CODE (op_1) == VEC_DUPLICATE) + *total = (gr2vr_cost + 1) * COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (1); + } + break; + default: + *total = COSTS_N_INSNS (1); + break; + } + } + break; + default: + *total = COSTS_N_INSNS (1); + break; + } + return true; } @@ -9690,7 +9723,7 @@ riscv_register_move_cost (machine_mode mode, if (to == V_REGS) { if (from_is_gpr) - return get_vector_costs ()->regmove->GR2VR; + return get_gr2vr_cost (); else if (from_is_fpr) return get_vector_costs ()->regmove->FR2VR; } @@ -12540,6 +12573,21 @@ get_vector_costs () return costs; } +/* Return the cost of operation that move from gpr to vr. + It will take the value of --param=gpr2vr_cost if it is provided. + Or the default regmove->GR2VR will be returned. */ + +int +get_gr2vr_cost () +{ + int cost = get_vector_costs ()->regmove->GR2VR; + + if (gpr2vr_cost != GPR2VR_COST_UNPROVIDED) + cost = gpr2vr_cost; + + return cost; +} + /* Implement targetm.vectorize.builtin_vectorization_cost. */ static int @@ -12606,7 +12654,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, { /* TODO: This is too pessimistic in case we can splat. */ int regmove_cost = fp ? costs->regmove->FR2VR - : costs->regmove->GR2VR; + : get_gr2vr_cost (); return (regmove_cost + common_costs->scalar_to_vec_cost) * estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); } diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 15c89ff..259997f 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -3173,15 +3173,25 @@ "#" "&& reload_completed" [(set (match_dup 4) (lshiftrt:X (subreg:X (match_dup 2) 0) (match_dup 6))) - (set (match_dup 4) (and:X (match_dup 4) (match_dup 7))) + (set (match_dup 4) (match_dup 8)) (set (pc) (if_then_else (match_op_dup 1 [(match_dup 4) (const_int 0)]) (label_ref (match_dup 0)) (pc)))] { - HOST_WIDE_INT mask = INTVAL (operands[3]); - int trailing = ctz_hwi (mask); + HOST_WIDE_INT mask = INTVAL (operands[3]); + int trailing = ctz_hwi (mask); + + operands[6] = GEN_INT (trailing); + operands[7] = GEN_INT (mask >> trailing); - operands[6] = GEN_INT (trailing); - operands[7] = GEN_INT (mask >> trailing); + /* This splits after reload, so there's little chance to clean things + up. Rather than emit a ton of RTL here, we can just make a new + operand for that RHS and use it. For the case where the AND would + have been redundant, we can make it a NOP move, which does get + cleaned up. */ + if (operands[7] == CONSTM1_RTX (word_mode)) + operands[8] = operands[4]; + else + operands[8] = gen_rtx_AND (word_mode, operands[4], operands[7]); } [(set_attr "type" "branch")]) diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 7515c8e..9e471be 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -466,6 +466,10 @@ Mask(XCVBI) Var(riscv_xcv_subext) TargetVariable int riscv_sv_subext +Mask(SVADE) Var(riscv_sv_subext) + +Mask(SVADU) Var(riscv_sv_subext) + Mask(SVINVAL) Var(riscv_sv_subext) Mask(SVNAPOT) Var(riscv_sv_subext) @@ -579,6 +583,10 @@ Inline strlen calls if possible. Target RejectNegative Joined UInteger Var(riscv_strcmp_inline_limit) Init(64) Max number of bytes to compare as part of inlined strcmp/strncmp routines (default: 64). +-param=gpr2vr-cost= +Target RejectNegative Joined UInteger Var(gpr2vr_cost) Init(GPR2VR_COST_UNPROVIDED) +Set the cost value of the rvv instruction when operate from GPR to VR. + Enum Name(rvv_max_lmul) Type(enum rvv_max_lmul_enum) The RVV possible LMUL (-mrvv-max-lmul=): diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md index b4c86909..eae3340 100644 --- a/gcc/config/riscv/vector-iterators.md +++ b/gcc/config/riscv/vector-iterators.md @@ -4041,6 +4041,10 @@ smax umax smin umin mult div udiv mod umod ]) +(define_code_iterator any_int_binop_no_shift_vx [ + plus +]) + (define_code_iterator any_int_unop [neg not]) (define_code_iterator any_commutative_binop [plus and ior xor |