Diffstat (limited to 'gcc/config/riscv')
-rw-r--r-- | gcc/config/riscv/autovec-opt.md                  |  31
-rw-r--r-- | gcc/config/riscv/autovec.md                      |   7
-rw-r--r-- | gcc/config/riscv/gen-riscv-mcpu-texi.cc          |  43
-rw-r--r-- | gcc/config/riscv/gen-riscv-mtune-texi.cc         |  41
-rw-r--r-- | gcc/config/riscv/generic-vector-ooo.md           |  85
-rw-r--r-- | gcc/config/riscv/predicates.md                   |  13
-rw-r--r-- | gcc/config/riscv/riscv-ext.def                   |  30
-rw-r--r-- | gcc/config/riscv/riscv-protos.h                  |  16
-rw-r--r-- | gcc/config/riscv/riscv-string.cc                 |   6
-rw-r--r-- | gcc/config/riscv/riscv-v.cc                      | 302
-rw-r--r-- | gcc/config/riscv/riscv-vector-builtins-bases.cc  |   3
-rw-r--r-- | gcc/config/riscv/riscv-vector-builtins.cc        |  41
-rw-r--r-- | gcc/config/riscv/riscv-vector-builtins.h         |   1
-rw-r--r-- | gcc/config/riscv/riscv-vector-costs.cc           |  71
-rw-r--r-- | gcc/config/riscv/riscv-vector-costs.h            |  16
-rw-r--r-- | gcc/config/riscv/riscv.cc                        |  60
-rw-r--r-- | gcc/config/riscv/riscv.md                        |  10
-rw-r--r-- | gcc/config/riscv/t-riscv                         |  37
-rw-r--r-- | gcc/config/riscv/vector-iterators.md             |  16
-rw-r--r-- | gcc/config/riscv/vector.md                       | 380
20 files changed, 864 insertions(+), 345 deletions(-)
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md index d884942..6531996 100644 --- a/gcc/config/riscv/autovec-opt.md +++ b/gcc/config/riscv/autovec-opt.md @@ -1714,7 +1714,7 @@ } [(set_attr "type" "vialu")]) -(define_insn_and_split "*uavg_floor_vx_<mode>" +(define_insn_and_split "*<sat_op_v_vdup>_vx_<mode>" [(set (match_operand:V_VLSI 0 "register_operand") (if_then_else:V_VLSI (unspec:<VM> @@ -1730,7 +1730,7 @@ (unspec:V_VLSI [(match_operand:V_VLSI 3 "register_operand") (vec_duplicate:V_VLSI - (match_operand:<VEL> 4 "register_operand"))] UNSPEC_VAADDU) + (match_operand:<VEL> 4 "reg_or_int_operand"))] VSAT_VX_OP_V_VDUP) (unspec:V_VLSI [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))] "TARGET_VECTOR && can_create_pseudo_p ()" @@ -1738,14 +1738,17 @@ "&& 1" [(const_int 0)] { - insn_code code = code_for_pred_scalar (UNSPEC_VAADDU, <MODE>mode); - rtx ops[] = {operands[0], operands[3], operands[4]}; - riscv_vector::emit_vlmax_insn (code, riscv_vector::BINARY_OP_VXRM_RDN, ops); + int vxrm_val = INTVAL (operands[9]); + riscv_vector::expand_vx_binary_vxrm_vec_vec_dup (operands[0], operands[3], + operands[4], + <VSAT_VX_OP_V_VDUP>, + vxrm_val, <MODE>mode); + DONE; } [(set_attr "type" "vaalu")]) -(define_insn_and_split "*uavg_floor_vx_<mode>" +(define_insn_and_split "*<sat_op_vdup_v>_vx_<mode>" [(set (match_operand:V_VLSI 0 "register_operand") (if_then_else:V_VLSI (unspec:<VM> @@ -1760,8 +1763,8 @@ (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE) (unspec:V_VLSI [(vec_duplicate:V_VLSI - (match_operand:<VEL> 4 "register_operand")) - (match_operand:V_VLSI 3 "register_operand")] UNSPEC_VAADDU) + (match_operand:<VEL> 4 "reg_or_int_operand")) + (match_operand:V_VLSI 3 "register_operand")] VSAT_VX_OP_VDUP_V) (unspec:V_VLSI [(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))] "TARGET_VECTOR && can_create_pseudo_p ()" @@ -1769,9 +1772,12 @@ "&& 1" [(const_int 0)] { - insn_code code = code_for_pred_scalar (UNSPEC_VAADDU, <MODE>mode); - rtx ops[] = {operands[0], operands[3], operands[4]}; - riscv_vector::emit_vlmax_insn (code, riscv_vector::BINARY_OP_VXRM_RDN, ops); + int vxrm_val = INTVAL (operands[9]); + riscv_vector::expand_vx_binary_vxrm_vec_dup_vec (operands[0], operands[3], + operands[4], + <VSAT_VX_OP_VDUP_V>, + vxrm_val, <MODE>mode); + DONE; } [(set_attr "type" "vaalu")]) @@ -1900,8 +1906,7 @@ emit_insn (gen_extend<vsubel><vel>2(tmp, operands[1])); rtx ops[] = {operands[0], tmp}; - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, ops); + riscv_vector::expand_broadcast (<MODE>mode, ops); DONE; } [(set_attr "type" "vfwmuladd")] diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 1fff8ac..48de5ef 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -1359,9 +1359,7 @@ if (operands[2] == const0_rtx) { rtx ops[] = {operands[0], operands[0], operands[1]}; - riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::SCALAR_MOVE_MERGED_OP_TU, - ops, CONST1_RTX (Pmode)); + riscv_vector::expand_set_first_tu (<MODE>mode, ops); } else { @@ -1385,8 +1383,7 @@ VL we need for the slide. */ rtx tmp = gen_reg_rtx (<MODE>mode); rtx ops1[] = {tmp, operands[1]}; - emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, ops1, length); + riscv_vector::expand_broadcast (<MODE>mode, ops1, length); /* Slide exactly one element up leaving the tail elements unchanged. 
*/ diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc new file mode 100644 index 0000000..9681438 --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc @@ -0,0 +1,43 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Core Name}"); + puts (""); + puts ("@opindex mcpu"); + puts ("@item -mcpu=@var{processor-string}"); + puts ("Use architecture of and optimize the output for the given processor, specified"); + puts ("by particular CPU name. Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> coreNames; + +#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \ + coreNames.push_back (CORE_NAME); +#include "riscv-cores.def" +#undef RISCV_CORE + + for (size_t i = 0; i < coreNames.size(); ++i) { + if (i == coreNames.size() - 1) { + printf("@samp{%s}.\n", coreNames[i].c_str()); + } else { + printf("@samp{%s},\n\n", coreNames[i].c_str()); + } + } + + return 0; +} diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc new file mode 100644 index 0000000..1bdfe2a --- /dev/null +++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc @@ -0,0 +1,41 @@ +#include <string> +#include <vector> +#include <stdio.h> + +int +main () +{ + puts ("@c Copyright (C) 2025 Free Software Foundation, Inc."); + puts ("@c This is part of the GCC manual."); + puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi."); + puts (""); + puts ("@c This file is generated automatically using"); + puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:"); + puts ("@c gcc/config/riscv/riscv-cores.def"); + puts (""); + puts ("@c Please *DO NOT* edit manually."); + puts (""); + puts ("@samp{Tune Name}"); + puts (""); + puts ("@opindex mtune"); + puts ("@item -mtune=@var{processor-string}"); + puts ("Optimize the output for the given processor, specified by microarchitecture or"); + puts ("particular CPU name. Permissible values for this option are:"); + puts (""); + puts (""); + + std::vector<std::string> tuneNames; + +#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \ + tuneNames.push_back (TUNE_NAME); +#include "riscv-cores.def" +#undef RISCV_TUNE + + for (size_t i = 0; i < tuneNames.size(); ++i) { + printf("@samp{%s},\n\n", tuneNames[i].c_str()); + } + + puts ("and all valid options for @option{-mcpu=}."); + + return 0; +} diff --git a/gcc/config/riscv/generic-vector-ooo.md b/gcc/config/riscv/generic-vector-ooo.md index ab9e57f..773003b 100644 --- a/gcc/config/riscv/generic-vector-ooo.md +++ b/gcc/config/riscv/generic-vector-ooo.md @@ -17,6 +17,9 @@ ;; <http://www.gnu.org/licenses/>. ;; Vector load/store +;; The insn reservations include "generic" as we won't have a in-order +;; generic definition for vector instructions. + (define_automaton "vector_ooo") ;; Separate issue queue for vector instructions. 
@@ -29,119 +32,141 @@ (define_cpu_unit "vxu_ooo_multicycle" "vector_ooo") (define_insn_reservation "vec_load" 6 - (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr")) "vxu_ooo_issue,vxu_ooo_alu") (define_insn_reservation "vec_store" 6 - (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector segment loads/stores. (define_insn_reservation "vec_loadstore_seg" 10 - (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ - vssegte,vssegts,vssegtux,vssegtox") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ + vssegte,vssegts,vssegtux,vssegtox")) "vxu_ooo_issue,vxu_ooo_alu") ;; Regular vector operations and integer comparisons. (define_insn_reservation "vec_alu" 3 - (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\ - vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\ - vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\ + vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\ + vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float comparison, conversion etc. (define_insn_reservation "vec_fcmp" 3 - (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\ - vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\ - vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\ + vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\ + vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector integer multiplication. (define_insn_reservation "vec_imul" 4 - (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\ - vghsh,vgmul") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\ + vghsh,vgmul")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float addition. (define_insn_reservation "vec_fadd" 4 - (eq_attr "type" "vfalu,vfwalu") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfalu,vfwalu")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector float multiplication and FMA. (define_insn_reservation "vec_fmul" 6 - (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, assumed to be a generic operation for now. 
(define_insn_reservation "vec_crypto" 4 - (eq_attr "type" "crypto,vclz,vctz,vcpop") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "crypto,vclz,vctz,vcpop")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, AES (define_insn_reservation "vec_crypto_aes" 4 - (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, sha (define_insn_reservation "vec_crypto_sha" 4 - (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector crypto, SM3/4 (define_insn_reservation "vec_crypto_sm" 4 - (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector permute. (define_insn_reservation "vec_perm" 3 - (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\ - vislide1down,vfslide1up,vfslide1down,vgather,vcompress") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\ + vislide1down,vfslide1up,vfslide1down,vgather,vcompress")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector reduction. (define_insn_reservation "vec_reduction" 8 - (eq_attr "type" "vired,viwred,vfredu,vfwredu") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vired,viwred,vfredu,vfwredu")) "vxu_ooo_issue,vxu_ooo_multicycle") ;; Vector ordered reduction, assume the latency number is for ;; a 128-bit vector. It is scaled in riscv_sched_adjust_cost ;; for larger vectors. (define_insn_reservation "vec_ordered_reduction" 10 - (eq_attr "type" "vfredo,vfwredo") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfredo,vfwredo")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector integer division, assume not pipelined. (define_insn_reservation "vec_idiv" 16 - (eq_attr "type" "vidiv") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vidiv")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector float divisions and sqrt, assume not pipelined. (define_insn_reservation "vec_float_divsqrt" 16 - (eq_attr "type" "vfdiv,vfsqrt") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vfdiv,vfsqrt")) "vxu_ooo_issue,vxu_ooo_multicycle*3") ;; Vector mask operations. (define_insn_reservation "vec_mask" 2 - (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\ - vfmovvf,vfmovfv") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\ + vfmovvf,vfmovfv")) "vxu_ooo_issue,vxu_ooo_alu") ;; Vector vsetvl. (define_insn_reservation "vec_vesetvl" 1 - (eq_attr "type" "vsetvl,vsetvl_pre") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "vsetvl,vsetvl_pre")) "vxu_ooo_issue") ;; Vector rounding mode setters, assume pipeline barrier. (define_insn_reservation "vec_setrm" 20 - (eq_attr "type" "wrvxrm,wrfrm") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "wrvxrm,wrfrm")) "vxu_ooo_issue,vxu_ooo_issue*3") ;; Vector read vlen/vlenb. (define_insn_reservation "vec_readlen" 4 - (eq_attr "type" "rdvlenb,rdvl") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "rdvlenb,rdvl")) "vxu_ooo_issue,vxu_ooo_issue") ;; Vector sf_vcp. 
(define_insn_reservation "vec_sf_vcp" 2 - (eq_attr "type" "sf_vc,sf_vc_se") + (and (eq_attr "tune" "generic_ooo,generic") + (eq_attr "type" "sf_vc,sf_vc_se")) "vxu_ooo_issue") diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 1f9a6b5..381f96c 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -518,6 +518,10 @@ (define_predicate "vector_broadcast_mask_operand" (ior (match_operand 0 "vector_least_significant_set_mask_operand") + (match_operand 0 "vector_all_trues_mask_operand"))) + +(define_predicate "strided_broadcast_mask_operand" + (ior (match_operand 0 "vector_least_significant_set_mask_operand") (ior (match_operand 0 "register_operand") (match_operand 0 "vector_all_trues_mask_operand")))) @@ -619,6 +623,15 @@ (define_predicate "direct_broadcast_operand" (match_test "riscv_vector::can_be_broadcast_p (op)")) +;; A strided broadcast is just a fallback pattern that loads from +;; memory. +(define_predicate "strided_broadcast_operand" + (match_test "riscv_vector::strided_broadcast_p (op)")) + +(define_predicate "any_broadcast_operand" + (ior (match_operand 0 "direct_broadcast_operand") + (match_operand 0 "strided_broadcast_operand"))) + ;; A CONST_INT operand that has exactly two bits cleared. (define_predicate "const_nottwobits_operand" (and (match_code "const_int") diff --git a/gcc/config/riscv/riscv-ext.def b/gcc/config/riscv/riscv-ext.def index 6fc6d38..09f18ad 100644 --- a/gcc/config/riscv/riscv-ext.def +++ b/gcc/config/riscv/riscv-ext.def @@ -80,8 +80,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{2, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 4, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -190,8 +190,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zba", "zbb", "zbs"}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 1, /* EXTRA_EXTENSION_FLAGS */ EXT_FLAG_MACRO) DEFINE_RISCV_EXT( @@ -216,8 +216,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ base, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 0, + /* BITMASK_BIT_POSITION*/ 7, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -398,8 +398,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{2, 0}}), /* FLAG_GROUP */ zi, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 11, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -464,7 +464,7 @@ DEFINE_RISCV_EXT( /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zi, /* BITMASK_GROUP_ID */ 1, - /* BITMASK_BIT_POSITION*/ 1, + /* BITMASK_BIT_POSITION*/ 8, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -476,8 +476,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zm, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 12, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -787,8 +787,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zca"}), /* SUPPORTED_VERSIONS */ 
({{1, 0}}), /* FLAG_GROUP */ zc, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 10, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( @@ -813,8 +813,8 @@ DEFINE_RISCV_EXT( /* DEP_EXTS */ ({"zca", "zilsd"}), /* SUPPORTED_VERSIONS */ ({{1, 0}}), /* FLAG_GROUP */ zc, - /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED, - /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_GROUP_ID */ 1, + /* BITMASK_BIT_POSITION*/ 9, /* EXTRA_EXTENSION_FLAGS */ 0) DEFINE_RISCV_EXT( diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index a41c4c2..539321f 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -414,8 +414,14 @@ enum insn_flags : unsigned int /* Means INSN has VXRM operand and the value is VXRM_RNU. */ VXRM_RNU_P = 1 << 20, + /* Means INSN has VXRM operand and the value is VXRM_RNE. */ + VXRM_RNE_P = 1 << 21, + /* Means INSN has VXRM operand and the value is VXRM_RDN. */ - VXRM_RDN_P = 1 << 21, + VXRM_RDN_P = 1 << 22, + + /* Means INSN has VXRM operand and the value is VXRM_ROD. */ + VXRM_ROD_P = 1 << 23, }; enum insn_type : unsigned int @@ -477,7 +483,9 @@ enum insn_type : unsigned int BINARY_OP_TUMA = __MASK_OP_TUMA | BINARY_OP_P, BINARY_OP_FRM_DYN = BINARY_OP | FRM_DYN_P, BINARY_OP_VXRM_RNU = BINARY_OP | VXRM_RNU_P, + BINARY_OP_VXRM_RNE = BINARY_OP | VXRM_RNE_P, BINARY_OP_VXRM_RDN = BINARY_OP | VXRM_RDN_P, + BINARY_OP_VXRM_ROD = BINARY_OP | VXRM_ROD_P, /* Ternary operator. Always have real merge operand. */ TERNARY_OP = HAS_DEST_P | HAS_MASK_P | USE_ALL_TRUES_MASK_P | HAS_MERGE_P @@ -672,6 +680,8 @@ void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode, machine_mode); void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode); void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode); +void expand_vx_binary_vxrm_vec_vec_dup (rtx, rtx, rtx, int, int, machine_mode); +void expand_vx_binary_vxrm_vec_dup_vec (rtx, rtx, rtx, int, int, machine_mode); #endif bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool, void (*)(rtx *, rtx), enum avl_type); @@ -695,6 +705,9 @@ bool expand_block_move (rtx, rtx, rtx, bool); machine_mode preferred_simd_mode (scalar_mode); machine_mode get_mask_mode (machine_mode); void expand_vec_series (rtx, rtx, rtx, rtx = 0); +void expand_broadcast (machine_mode, rtx *, rtx = 0); +void expand_set_first (machine_mode, rtx *, rtx = 0); +void expand_set_first_tu (machine_mode, rtx *, rtx = 0); void expand_vec_init (rtx, rtx); void expand_vec_perm (rtx, rtx, rtx, rtx); void expand_select_vl (rtx *); @@ -762,6 +775,7 @@ enum vlmul_type get_vlmul (rtx_insn *); int count_regno_occurrences (rtx_insn *, unsigned int); bool imm_avl_p (machine_mode); bool can_be_broadcast_p (rtx); +bool strided_broadcast_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 9080189..61c4a09 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -1625,16 +1625,14 @@ expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in) Otherwise, use a predicated store. 
*/ if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl))) { - emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP, - broadcast_ops); + riscv_vector::expand_broadcast (info.vmode, broadcast_ops); emit_move_insn (dst, fill_value); } else { if (!satisfies_constraint_vl (info.avl)) info.avl = force_reg (Pmode, info.avl); - emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode), - riscv_vector::UNARY_OP, broadcast_ops, info.avl); + riscv_vector::expand_broadcast (info.vmode, broadcast_ops, info.avl); machine_mode mask_mode = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode)) .require (); diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 242ac08..c9c8328 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -351,9 +351,12 @@ public: add_rounding_mode_operand (FRM_RNE); else if (m_insn_flags & VXRM_RNU_P) add_rounding_mode_operand (VXRM_RNU); + else if (m_insn_flags & VXRM_RNE_P) + add_rounding_mode_operand (VXRM_RNE); else if (m_insn_flags & VXRM_RDN_P) add_rounding_mode_operand (VXRM_RDN); - + else if (m_insn_flags & VXRM_ROD_P) + add_rounding_mode_operand (VXRM_ROD); if (insn_data[(int) icode].n_operands != m_opno) internal_error ("invalid number of operands for insn %s, " @@ -1190,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target, return false; } +/* Helper function to emit a vmv.vx/vi and float variants. + If VL is not given a VLMAX insn will be emitted, otherwise + a non-VLMAX insn with length VL. + If the value to be broadcast is not suitable for vmv.vx + fall back to a vlse with zero stride. This itself has a + fallback if the uarch prefers not to use a strided load + for broadcast. */ + +void +expand_broadcast (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[1]; + avl_type type = vl ? NONVLMAX : VLMAX; + if (can_be_broadcast_p (elt)) + emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops, + type, vl); + else + emit_avltype_insn (code_for_pred_strided_broadcast (mode), + UNARY_OP, ops, type, vl); +} + +/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */ + +void +expand_set_first (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[1]; + avl_type type = vl ? 
NONVLMAX : VLMAX; + if (can_be_broadcast_p (elt)) + emit_avltype_insn (code_for_pred_broadcast (mode), + SCALAR_MOVE_OP, ops, type, vl); + else + emit_avltype_insn (code_for_pred_strided_broadcast (mode), + SCALAR_MOVE_OP, ops, type, vl); +} + +/* Similar to expand_set_first but keeping the tail elements + unchanged (TU) */ + +void +expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl) +{ + rtx elt = ops[2]; + if (!vl) + vl = const1_rtx; + if (can_be_broadcast_p (elt)) + emit_nonvlmax_insn (code_for_pred_broadcast (mode), + SCALAR_MOVE_MERGED_OP_TU, ops, vl); + else + emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode), + SCALAR_MOVE_MERGED_OP_TU, ops, vl); +} + static void expand_const_vec_duplicate (rtx target, rtx src, rtx elt) { @@ -1226,7 +1282,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt) if (lra_in_progress) { rtx ops[] = {result, elt}; - emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops); + expand_broadcast (mode, ops); } else { @@ -1278,8 +1334,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder) { dup = gen_reg_rtx (builder->new_mode ()); rtx ops[] = {dup, ele}; - emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()), - UNARY_OP, ops); + expand_broadcast (builder->new_mode (), ops); } else dup = expand_vector_broadcast (builder->new_mode (), ele); @@ -1322,8 +1377,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder) rtx tmp1 = gen_reg_rtx (builder->mode ()); rtx dup_ops[] = {tmp1, builder->elt (0)}; - emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP, - dup_ops); + expand_broadcast (builder->mode (), dup_ops); for (unsigned int i = 1; i < builder->npatterns (); i++) { @@ -2136,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x) } } +/* This is a helper for binary ops with DImode scalar operands that are + broadcast (like vadd.vx v1, a1). + Instead of having similar code for all the expanders this function + unifies the handling. For 64-bit targets all we do is choose + between the vi variant (if available) and the register variant. + For 32-bit targets we either create the sign-extending variant + of vop.vx (when the immediate fits 32 bits) or emit a vector + broadcast of the 64-bit register/immediate and switch to a + vop.vv (replacing the scalar op with the broadcast vector. */ + bool sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, machine_mode vector_mode, bool has_vi_variant_p, void (*emit_vector_func) (rtx *, rtx), enum avl_type type) { machine_mode scalar_mode = GET_MODE_INNER (vector_mode); + + /* If the scalar broadcast op fits an immediate, use the + vop.vi variant if there is one. */ if (has_vi_variant_p) { *scalar_op = force_reg (scalar_mode, *scalar_op); return false; } + /* On a 64-bit target we can always use the vop.vx variant. */ if (TARGET_64BIT) { if (!rtx_equal_p (*scalar_op, const0_rtx)) @@ -2155,6 +2223,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, return false; } + /* For 32 bit and if there is no vop.vi variant for a 32-bit immediate + we need to use the sign-extending (SI -> DI) vop.vx variants. */ if (immediate_operand (*scalar_op, Pmode)) { if (!rtx_equal_p (*scalar_op, const0_rtx)) @@ -2164,40 +2234,29 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, return false; } - bool avoid_strided_broadcast = false; + /* Now we're left with a 64-bit immediate or a register. + We cannot use a vop.vx variant but must broadcast the value first + and switch to a vop.vv variant. 
+ Broadcast can either be done via vlse64.v v1, reg, zero + or by loading one 64-bit element (vle64.v) and using a + broadcast vrgather.vi. This is decided when splitting + the strided broadcast insn. */ + gcc_assert (!TARGET_64BIT + && (CONST_INT_P (*scalar_op) + || register_operand (*scalar_op, scalar_mode))); + if (CONST_INT_P (*scalar_op)) { if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode))) - { - if (strided_load_broadcast_p ()) - *scalar_op = force_const_mem (scalar_mode, *scalar_op); - else - avoid_strided_broadcast = true; - } + *scalar_op = force_const_mem (scalar_mode, *scalar_op); else *scalar_op = force_reg (scalar_mode, *scalar_op); } rtx tmp = gen_reg_rtx (vector_mode); - if (!avoid_strided_broadcast) - { - rtx ops[] = {tmp, *scalar_op}; - emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops, - type, vl); - } - else - { - /* Load scalar as V1DI and broadcast via vrgather.vi. */ - rtx tmp1 = gen_reg_rtx (V1DImode); - emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op, - scalar_mode)); - tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode); - - rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)}; - emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode), - BINARY_OP, ops); - } - + rtx ops[] = {tmp, *scalar_op}; + emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode), + UNARY_OP, ops, type, vl); emit_vector_func (operands, tmp); return true; @@ -2591,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target, /* Step 1: Broadcast the first pattern. */ rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))}; - emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), - UNARY_OP, ops); + expand_broadcast (builder.mode (), ops); /* Step 2: Merge the rest iteration of pattern. */ for (unsigned int i = 1; i < builder.npatterns (); i++) { @@ -2605,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target, if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */ { rtx ops[] = {dup, merge_mask}; - emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)), - SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode)); + expand_set_first (GET_MODE (dup), ops); } else /* vmv.v.x. 
*/ { @@ -2614,8 +2671,7 @@ expand_vector_init_merge_repeating_sequence (rtx target, force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)}; rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()), Pmode); - emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP, - ops, vl); + expand_broadcast (mask_int_mode, ops, vl); } emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup)); @@ -4706,20 +4762,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe, rtx m1_tmp = gen_reg_rtx (m1_mode); rtx scalar_move_ops[] = {m1_tmp, init}; - insn_code icode = code_for_pred_broadcast (m1_mode); if (need_mask_operand_p (insn_flags)) { if (need_vl0_safe) - emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx); + expand_set_first (m1_mode, scalar_move_ops, const1_rtx); else - emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op); + expand_set_first (m1_mode, scalar_move_ops, vl_op); } else - emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops); + expand_set_first (m1_mode, scalar_move_ops); rtx m1_tmp2 = gen_reg_rtx (m1_mode); rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp}; + insn_code icode; if (need_vl0_safe) icode = code_for_pred (unspec_for_vl0_safe, vmode); else @@ -5597,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops); } +static enum insn_type +get_insn_type_by_vxrm_val (int vxrm_val) +{ + enum insn_type itype; + + switch (vxrm_val) + { + case VXRM_RNU: + itype = BINARY_OP_VXRM_RNU; + break; + case VXRM_RNE: + itype = BINARY_OP_VXRM_RNE; + break; + case VXRM_RDN: + itype = BINARY_OP_VXRM_RDN; + break; + case VXRM_ROD: + itype = BINARY_OP_VXRM_ROD; + break; + default: + gcc_unreachable (); + } + + return itype; +} + +/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)) + and its' vxrm value. Aka the second op comes from the vec_duplicate, + and the first op is the vector reg. */ + +void +expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec, + int vxrm_val, machine_mode mode) +{ + enum insn_code icode; + enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val); + rtx ops[] = {op_0, op_1, op_2}; + + switch (unspec) + { + case UNSPEC_VAADD: + case UNSPEC_VAADDU: + icode = code_for_pred_scalar (unspec, mode); + break; + default: + gcc_unreachable (); + } + + emit_vlmax_insn (icode, itype, ops); +} + +/* Expand the binary vx combine with the format like v2 = vop(vec_dup(x), v1) + and its' vxrm value. Aka the second op comes from the vec_duplicate, + and the first op is the vector reg. */ + +void +expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec, + int vxrm_val, machine_mode mode) +{ + enum insn_code icode; + enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val); + rtx ops[] = {op_0, op_1, op_2}; + + switch (unspec) + { + case UNSPEC_VAADD: + case UNSPEC_VAADDU: + icode = code_for_pred_scalar (unspec, mode); + break; + default: + gcc_unreachable (); + } + + emit_vlmax_insn (icode, itype, ops); +} + /* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)). Aka the second op comes from the vec_duplicate, and the first op is the vector reg. */ @@ -5808,25 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno) return count; } -/* Return true if the OP can be directly broadcast. */ +/* Return true if the OP can be broadcast with a + v[f]mv.v.[xif] instruction. 
*/ + bool can_be_broadcast_p (rtx op) { machine_mode mode = GET_MODE (op); - /* We don't allow RA (register allocation) reload generate - (vec_duplicate:DI reg) in RV32 system wheras we allow - (vec_duplicate:DI mem) in RV32 system. */ - if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode) - && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode)) - && !satisfies_constraint_Wdm (op)) + + /* Zero always works and we can always put an immediate into a + register. + What's tricky is that for an immediate we don't know the + register's mode it will end up in, i.e. what element size + we want to broadcast. So even if the immediate is small it might + still end up in a DImode register that we cannot broadcast. + vmv.s.x, i.e. a single-element set can handle this, though, + because it implicitly sign-extends to SEW. */ + if (rtx_equal_p (op, CONST0_RTX (mode)) + || const_int_operand (op, Xmode)) + return true; + + /* Do not accept DImode broadcasts on !TARGET_64BIT. Those + are handled by strided broadcast. */ + if (INTEGRAL_MODE_P (mode) + && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD)) + return false; + + /* Non-register operands that can be forced into a register we can + handle. These don't need to use strided broadcast. */ + if (INTEGRAL_MODE_P (mode) + && (memory_operand (op, mode) || CONST_POLY_INT_P (op)) + && can_create_pseudo_p ()) + return true; + + /* Likewise, do not accept HFmode broadcast if we don't have + vfmv.v.f for 16-bit registers available. */ + if (mode == HFmode && !TARGET_ZVFH) + return false; + + /* Same for float, just that we can always handle 64-bit doubles + even on !TARGET_64BIT. We have ruled out 16-bit HF already + above. */ + if (FLOAT_MODE_P (mode) + && (memory_operand (op, mode) || CONSTANT_P (op)) + && can_create_pseudo_p ()) + return true; + + /* After excluding all the cases we cannot handle the register types + that remain can always be broadcast. */ + if (register_operand (op, mode)) + return true; + + return false; +} + +/* Returns true for all operands that cannot use vmv.vx, vfmv.vf, + vmv.s.x, or vfmv.s.f but rather need to go via memory. */ + +bool +strided_broadcast_p (rtx op) +{ + machine_mode mode = GET_MODE (op); + if (!memory_operand (op, mode) + && !register_operand (op, mode) + && !rtx_equal_p (op, CONST0_RTX (mode)) + && !const_int_operand (op, mode)) return false; - if (satisfies_constraint_K (op) || register_operand (op, mode) - || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op)) - || rtx_equal_p (op, CONST0_RTX (mode))) + /* !TARGET64_BIT does not have a vmv.v.x/vmv.s.x for 64-bit + DImode elements. */ + if (INTEGRAL_MODE_P (mode) + && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD)) + return true; + + /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f. for 16-bit elements. */ + if (!TARGET_ZVFH && mode == HFmode) return true; - return can_create_pseudo_p () && nonmemory_operand (op, mode); + return false; } void @@ -5941,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) return false; } -/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */ +/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. + That's the case if we're dealing with a scalar broadcast that + has VL = 1. 
*/ + bool splat_to_scalar_move_p (rtx *ops) { diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index bf5172c..7e4d396 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -643,7 +643,8 @@ public: return e.use_exact_insn (code_for_pred_mov (e.vector_mode ())); case OP_TYPE_x: case OP_TYPE_f: - return e.use_exact_insn (code_for_pred_broadcast (e.vector_mode ())); + return e.use_scalar_broadcast_insn + (code_for_pred_broadcast (e.vector_mode ())); default: gcc_unreachable (); } diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc index 8810af0..0db7549 100644 --- a/gcc/config/riscv/riscv-vector-builtins.cc +++ b/gcc/config/riscv/riscv-vector-builtins.cc @@ -4753,7 +4753,10 @@ function_expander::use_ternop_insn (bool vd_accum_p, insn_code icode) } /* Implement the call using instruction ICODE, with a 1:1 mapping between - arguments and input operands. */ + arguments and input operands. + There are operands that cannot be broadcast using v[f]mv. In that case + we switch to a strided broadcast. */ + rtx function_expander::use_widen_ternop_insn (insn_code icode) { @@ -4794,7 +4797,10 @@ function_expander::use_widen_ternop_insn (insn_code icode) } /* Implement the call using instruction ICODE, with a 1:1 mapping between - arguments and input operands. */ + arguments and input operands. + There are operands that cannot be broadcast using v[f]mv. In that case + we switch to a strided broadcast. */ + rtx function_expander::use_scalar_move_insn (insn_code icode) { @@ -4812,6 +4818,37 @@ function_expander::use_scalar_move_insn (insn_code icode) for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) add_input_operand (argno); + if (!can_be_broadcast_p (m_ops[3].value)) + icode = code_for_pred_strided_broadcast (vector_mode ()); + + add_input_operand (Pmode, get_tail_policy_for_pred (pred)); + add_input_operand (Pmode, get_mask_policy_for_pred (pred)); + add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, with a 1:1 mapping between + arguments and input operands. */ +rtx +function_expander::use_scalar_broadcast_insn (insn_code icode) +{ + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); + + /* Record the offset to get the argument. */ + int arg_offset = 0; + add_all_one_mask_operand (mask_mode ()); + + if (use_real_merge_p (pred)) + add_input_operand (arg_offset++); + else + add_vundef_operand (mode); + + for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) + add_input_operand (argno); + + if (!can_be_broadcast_p (m_ops[3].value)) + icode = code_for_pred_strided_broadcast (vector_mode ()); + add_input_operand (Pmode, get_tail_policy_for_pred (pred)); add_input_operand (Pmode, get_mask_policy_for_pred (pred)); add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h index 1f2587a..86d8115 100644 --- a/gcc/config/riscv/riscv-vector-builtins.h +++ b/gcc/config/riscv/riscv-vector-builtins.h @@ -497,6 +497,7 @@ public: rtx use_ternop_insn (bool, insn_code); rtx use_widen_ternop_insn (insn_code); rtx use_scalar_move_insn (insn_code); + rtx use_scalar_broadcast_insn (insn_code); rtx generate_insn (insn_code); /* The function call expression. 
*/ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 4d8170d..1c6bc25 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -178,8 +178,8 @@ get_live_range (hash_map<tree, pair> *live_ranges, tree arg) STMT 5 (be vectorized) -- point 2 ... */ -static void -compute_local_program_points ( +void +costs::compute_local_program_points ( vec_info *vinfo, hash_map<basic_block, vec<stmt_point>> &program_points_per_bb) { @@ -274,14 +274,14 @@ loop_invariant_op_p (class loop *loop, /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, - bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, + slp_tree node, tree var, bool lhs_p) { if (!var) return false; gimple *stmt = STMT_VINFO_STMT (stmt_info); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + stmt_info = vect_stmt_to_vectorize (stmt_info); + enum stmt_vec_info_type type = SLP_TREE_TYPE (node); if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) { if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE @@ -357,8 +357,8 @@ variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, The live range of SSA 1 is [1, 3] in bb 2. The live range of SSA 2 is [0, 4] in bb 3. */ -static machine_mode -compute_local_live_ranges ( +machine_mode +costs::compute_local_live_ranges ( loop_vec_info loop_vinfo, const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb) @@ -388,8 +388,11 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (loop, program_point.stmt_info, lhs, - true)) + slp_tree *node = vinfo_slp_map.get (program_point.stmt_info); + if (!node) + continue; + if (variable_vectorized_p (loop, program_point.stmt_info, + *node, lhs, true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -406,8 +409,8 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - if (variable_vectorized_p (loop, program_point.stmt_info, var, - false)) + if (variable_vectorized_p (loop, program_point.stmt_info, + *node, var, false)) { biggest_mode = get_biggest_mode (biggest_mode, @@ -597,11 +600,11 @@ get_store_value (gimple *stmt) } /* Return true if additional vector vars needed. */ -static bool -need_additional_vector_vars_p (stmt_vec_info stmt_info) +bool +costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, + slp_tree node) { - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + enum stmt_vec_info_type type = SLP_TREE_TYPE (node); if (type == load_vec_info_type || type == store_vec_info_type) { if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) @@ -657,8 +660,8 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode) Then, after this function, we update SSA 1 live range in bb 2 into [2, 4] since SSA 1 is live out into bb 3. 
*/ -static void -update_local_live_ranges ( +void +costs::update_local_live_ranges ( vec_info *vinfo, hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb, @@ -685,8 +688,13 @@ update_local_live_ranges ( { gphi *phi = psi.phi (); stmt_vec_info stmt_info = vinfo->lookup_stmt (phi); - if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) - == undef_vec_info_type) + stmt_info = vect_stmt_to_vectorize (stmt_info); + slp_tree *node = vinfo_slp_map.get (stmt_info); + + if (!node) + continue; + + if (SLP_TREE_TYPE (*node) == undef_vec_info_type) continue; for (j = 0; j < gimple_phi_num_args (phi); j++) @@ -761,9 +769,12 @@ update_local_live_ranges ( if (!is_gimple_assign_or_call (gsi_stmt (si))) continue; stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); - if (need_additional_vector_vars_p (stmt_info)) + stmt_info = vect_stmt_to_vectorize (stmt_info); + slp_tree *node = vinfo_slp_map.get (stmt_info); + if (!node) + continue; + enum stmt_vec_info_type type = SLP_TREE_TYPE (*node); + if (need_additional_vector_vars_p (stmt_info, *node)) { /* For non-adjacent load/store STMT, we will potentially convert it into: @@ -816,8 +827,8 @@ update_local_live_ranges ( } /* Compute the maximum live V_REGS. */ -static bool -has_unexpected_spills_p (loop_vec_info loop_vinfo) +bool +costs::has_unexpected_spills_p (loop_vec_info loop_vinfo) { /* Compute local program points. It's a fast and effective computation. */ @@ -899,7 +910,11 @@ costs::analyze_loop_vinfo (loop_vec_info loop_vinfo) /* Detect whether we're vectorizing for VLA and should apply the unrolling heuristic described above m_unrolled_vls_niters. */ record_potential_vls_unrolling (loop_vinfo); +} +void +costs::record_lmul_spills (loop_vec_info loop_vinfo) +{ /* Detect whether the LOOP has unexpected spills. */ record_potential_unexpected_spills (loop_vinfo); } @@ -1239,8 +1254,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, int stmt_cost = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign); + if (stmt_info && node) + vinfo_slp_map.put (stmt_info, node); + /* Do one-time initialization based on the vinfo. 
*/ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo); + if (!m_analyzed_vinfo) { if (loop_vinfo) @@ -1326,6 +1345,8 @@ costs::finish_cost (const vector_costs *scalar_costs) { if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo)) { + record_lmul_spills (loop_vinfo); + adjust_vect_cost_per_loop (loop_vinfo); } vector_costs::finish_cost (scalar_costs); diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index de546a6..b84ceb1 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -91,7 +91,10 @@ private: typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; hash_set <tree_pair_hash> memrefs; + hash_map <stmt_vec_info, slp_tree> vinfo_slp_map; + void analyze_loop_vinfo (loop_vec_info); + void record_lmul_spills (loop_vec_info loop_vinfo); void record_potential_vls_unrolling (loop_vec_info); bool prefer_unrolled_loop () const; @@ -103,6 +106,19 @@ private: bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + void compute_local_program_points (vec_info *, + hash_map<basic_block, vec<stmt_point>> &); + void update_local_live_ranges (vec_info *, + hash_map<basic_block, vec<stmt_point>> &, + hash_map<basic_block, hash_map<tree, pair>> &, + machine_mode *); + machine_mode compute_local_live_ranges + (loop_vec_info, const hash_map<basic_block, vec<stmt_point>> &, + hash_map<basic_block, hash_map<tree, pair>> &); + + bool has_unexpected_spills_p (loop_vec_info); + bool need_additional_vector_vars_p (stmt_vec_info, slp_tree); + void adjust_vect_cost_per_loop (loop_vec_info); unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info, diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 3324819..0a9fcef 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -170,7 +170,7 @@ struct GTY(()) riscv_frame_info { }; enum riscv_privilege_levels { - UNKNOWN_MODE, USER_MODE, SUPERVISOR_MODE, MACHINE_MODE + UNKNOWN_MODE, SUPERVISOR_MODE, MACHINE_MODE, RNMI_MODE }; struct GTY(()) mode_switching_info { @@ -4040,6 +4040,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN switch (XINT (op, 1)) { case UNSPEC_VAADDU: + case UNSPEC_VAADD: *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); break; @@ -6924,12 +6925,18 @@ riscv_handle_type_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args, } string = TREE_STRING_POINTER (cst); - if (strcmp (string, "user") && strcmp (string, "supervisor") - && strcmp (string, "machine")) + if (!strcmp (string, "rnmi") && !TARGET_SMRNMI) + { + error ("attribute 'rnmi' requires the Smrnmi ISA extension"); + *no_add_attrs = true; + } + else if (strcmp (string, "supervisor") + && strcmp (string, "machine") + && strcmp (string, "rnmi")) { warning (OPT_Wattributes, - "argument to %qE attribute is not %<\"user\"%>, %<\"supervisor\"%>, " - "or %<\"machine\"%>", name); + "argument to %qE attribute is not %<\"supervisor\"%>, " + "%<\"machine\"%>, or %<\"rnmi\"%>", name); *no_add_attrs = true; } } @@ -9710,12 +9717,12 @@ riscv_expand_epilogue (int style) if (th_int_mask && TH_INT_INTERRUPT (cfun)) emit_jump_insn (gen_th_int_pop ()); - else if (mode == MACHINE_MODE) - emit_jump_insn (gen_riscv_mret ()); else if (mode == SUPERVISOR_MODE) emit_jump_insn (gen_riscv_sret ()); - else - emit_jump_insn (gen_riscv_uret ()); + else if (mode == RNMI_MODE) + emit_jump_insn (gen_riscv_mnret ()); + else /* Must be MACHINE_MODE. 
*/ + emit_jump_insn (gen_riscv_mret ()); } else if (style != SIBCALL_RETURN) { @@ -12057,10 +12064,10 @@ riscv_get_interrupt_type (tree decl) { const char *string = TREE_STRING_POINTER (TREE_VALUE (attr_args)); - if (!strcmp (string, "user")) - return USER_MODE; - else if (!strcmp (string, "supervisor")) + if (!strcmp (string, "supervisor")) return SUPERVISOR_MODE; + else if (!strcmp (string, "rnmi")) + return RNMI_MODE; else /* Must be "machine". */ return MACHINE_MODE; } @@ -12677,14 +12684,31 @@ riscv_estimated_poly_value (poly_int64 val, /* Return true if the vector misalignment factor is supported by the target. */ bool -riscv_support_vector_misalignment (machine_mode mode, - const_tree type ATTRIBUTE_UNUSED, - int misalignment, - bool is_packed ATTRIBUTE_UNUSED) +riscv_support_vector_misalignment (machine_mode mode, const_tree type, + int misalignment, bool is_packed, + bool is_gather_scatter) { - /* Depend on movmisalign pattern. */ + /* IS_PACKED is true if the corresponding scalar element is not naturally + aligned. If the misalignment is unknown and the the access is packed + we defer to the default hook which will check if movmisalign is present. + Movmisalign, in turn, depends on TARGET_VECTOR_MISALIGN_SUPPORTED. */ + if (misalignment == DR_MISALIGNMENT_UNKNOWN) + { + if (!is_packed) + return true; + } + else + { + /* If we know that misalignment is a multiple of the element size, we're + good. */ + if (misalignment % TYPE_ALIGN_UNIT (type) == 0) + return true; + } + + /* Otherwise fall back to movmisalign again. */ return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index c3b504d..578dd43 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -120,7 +120,7 @@ ;; Interrupt handler instructions. UNSPECV_MRET UNSPECV_SRET - UNSPECV_URET + UNSPECV_MNRET ;; Blockage and synchronization. 
UNSPECV_BLOCKAGE @@ -4166,11 +4166,11 @@ "sret" [(set_attr "type" "ret")]) -(define_insn "riscv_uret" +(define_insn "riscv_mnret" [(return) - (unspec_volatile [(const_int 0)] UNSPECV_URET)] - "" - "uret" + (unspec_volatile [(const_int 0)] UNSPECV_MNRET)] + "TARGET_SMRNMI" + "mnret" [(set_attr "type" "ret")]) (define_insn "stack_tie<mode>" diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 7aac56a..a7eaa8b 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext) $(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi $(STAMP) s-riscv-ext.texi -# Run `riscv-regen' after you changed or added anything from riscv-ext*.def +RISCV_CORES_DEFS = \ + $(srcdir)/config/riscv/riscv-cores.def + +build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \ + $(RISCV_CORES_DEFS) + $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@ + +build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \ + $(RISCV_CORES_DEFS) + $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@ + +build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o + $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $< + +build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o + $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $< + +$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS) +$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true + +$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS) +$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true + +s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext) + $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi + $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi + $(STAMP) s-riscv-mtune.texi + +s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext) + $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi + $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi + $(STAMP) s-riscv-mcpu.texi + +# Run `riscv-regen' after you changed or added anything from riscv-ext*.def and riscv-cores*.def .PHONY: riscv-regen -riscv-regen: s-riscv-ext.texi s-riscv-ext.opt +riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md index 5f6cc42..aa3b6fb 100644 --- a/gcc/config/riscv/vector-iterators.md +++ b/gcc/config/riscv/vector-iterators.md @@ -4013,6 +4013,14 @@ UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL UNSPEC_VSSRL UNSPEC_VSSRA]) +(define_int_iterator VSAT_VX_OP_V_VDUP [ + UNSPEC_VAADDU UNSPEC_VAADD +]) + +(define_int_iterator VSAT_VX_OP_VDUP_V [ + UNSPEC_VAADDU UNSPEC_VAADD +]) + (define_int_iterator VSAT_ARITH_OP [UNSPEC_VAADDU UNSPEC_VAADD UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL]) (define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL UNSPEC_VSSRA]) @@ -4047,6 +4055,14 @@ (UNSPEC_VSSRA "vsshift") (UNSPEC_VNCLIP "vnclip") (UNSPEC_VNCLIPU "vnclip")]) +(define_int_attr sat_op_v_vdup [ + (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd") +]) + +(define_int_attr sat_op_vdup_v [ + (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd") +]) + (define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") (UNSPEC_VMSOF "sof") (UNSPEC_VFRSQRT7 "rsqrt7")]) diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index c498166..66b7670 100644 --- 
a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1551,20 +1551,44 @@ (define_expand "vec_duplicate<mode>" [(set (match_operand:V_VLS 0 "register_operand") (vec_duplicate:V_VLS - (match_operand:<VEL> 1 "direct_broadcast_operand")))] + (match_operand:<VEL> 1 "any_broadcast_operand")))] "TARGET_VECTOR" { - /* Early expand DImode broadcast in RV32 system to avoid RA reload - generate (set (reg) (vec_duplicate:DI)). */ + /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form. + Otherwise combine or late combine could end up doing + "64-bit broadcast" (!= vmv.v.x) + + vadd.vv + = vadd.vx + which would be invalid. */ bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode)); if (!FLOAT_MODE_P (<VEL>mode) && gt_p) { - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), - riscv_vector::UNARY_OP, operands); - DONE; + riscv_vector::emit_vlmax_insn + (code_for_pred_strided_broadcast + (<MODE>mode), riscv_vector::UNARY_OP, operands); + DONE; } - /* Otherwise, allow it fall into general vec_duplicate pattern - which allow us to have vv->vx combine optimization in later pass. */ + + /* Even though we can eventually broadcast any permissible + constant by moving it into a register we need to force + any non-immediate one into a register here. + If we didn't do that we couldn't fwprop/late-combine + vec_duplicate 123.45f + + vfadd.vv + = vfadd.vf + because the constant is valid for vec_duplicate but not + for vfadd.vf. Therefore we need to do + fa0 = 123.45f + vec_duplicate fa0 + + vfadd.vv + = vfadd.vf */ + if (!satisfies_constraint_P (operands[1]) + && !satisfies_constraint_J (operands[1]) + && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode)) + && !memory_operand (operands[1], <VEL>mode)) + operands[1] = force_reg (<VEL>mode, operands[1]); + + /* Otherwise keep the vec_duplicate pattern until split. */ }) ;; According to GCC internal: @@ -1574,28 +1598,20 @@ (define_insn_and_split "*vec_duplicate<mode>" [(set (match_operand:V_VLS 0 "register_operand") (vec_duplicate:V_VLS - (match_operand:<VEL> 1 "direct_broadcast_operand")))] + (match_operand:<VEL> 1 "any_broadcast_operand")))] "TARGET_VECTOR && can_create_pseudo_p ()" "#" "&& 1" [(const_int 0)] { - if (!strided_load_broadcast_p () - && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode) - { - /* For Float16, reinterpret as HImode, broadcast and reinterpret - back. */ - poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode); - machine_mode vmodehi - = riscv_vector::get_vector_mode (HImode, nunits).require (); - rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode), - lowpart_subreg (HImode, operands[1], HFmode)}; - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi), - riscv_vector::UNARY_OP, ops); - } - else + if (riscv_vector::can_be_broadcast_p (operands[1])) riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), riscv_vector::UNARY_OP, operands); + else + riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast + (<MODE>mode), riscv_vector::UNARY_OP, + operands); + DONE; } [(set_attr "type" "vector")] @@ -2141,69 +2157,45 @@ (match_operand:V_VLS 2 "vector_merge_operand")))] "TARGET_VECTOR" { - /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f - has better chances to do vsetvl fusion in vsetvl pass. 
*/ bool wrap_vec_dup = true; rtx vec_cst = NULL_RTX; - if (riscv_vector::splat_to_scalar_move_p (operands)) - { - operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); - operands[3] = force_reg (<VEL>mode, operands[3]); - } - else if (immediate_operand (operands[3], <VEL>mode) - && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3])) - && (/* -> pred_broadcast<mode>_zero */ - (vector_least_significant_set_mask_operand (operands[1], - <VM>mode) - && vector_const_0_operand (vec_cst, <MODE>mode)) - || (/* pred_broadcast<mode>_imm */ - vector_all_trues_mask_operand (operands[1], <VM>mode) - && vector_const_int_or_double_0_operand (vec_cst, - <MODE>mode)))) + if (immediate_operand (operands[3], <VEL>mode) + && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3])) + && (/* -> pred_broadcast<mode>_zero */ + (vector_least_significant_set_mask_operand (operands[1], + <VM>mode) + && vector_const_0_operand (vec_cst, <MODE>mode)) + || (/* pred_broadcast<mode>_imm */ + vector_all_trues_mask_operand (operands[1], <VM>mode) + && vector_const_int_or_double_0_operand (vec_cst, + <MODE>mode)))) { operands[3] = vec_cst; wrap_vec_dup = false; } - /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */ - else if (satisfies_constraint_Wdm (operands[3])) - { - if (satisfies_constraint_Wb1 (operands[1])) - { - /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */ - if (satisfies_constraint_vu (operands[2])) - operands[1] = CONSTM1_RTX (<VM>mode); - else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)) - { - /* Case 2: vmv.s.x (TU, x == memory) ==> - vl = 0 or 1; + vlse.v (TU) in RV32 system */ - operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); - operands[1] = CONSTM1_RTX (<VM>mode); - } - else - /* Case 3: load x (memory) to register. */ - operands[3] = force_reg (<VEL>mode, operands[3]); - } - } - else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode) - && (immediate_operand (operands[3], Pmode) + else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD + && satisfies_constraint_Wb1 (operands[1]) + && (immediate_operand (operands[3], Xmode) || (CONST_POLY_INT_P (operands[3]) && known_ge (rtx_to_poly_int64 (operands[3]), 0U) - && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode))))) + && known_le (rtx_to_poly_int64 (operands[3]), + GET_MODE_SIZE (<MODE>mode))))) { rtx tmp = gen_reg_rtx (Pmode); poly_int64 value = rtx_to_poly_int64 (operands[3]); - emit_move_insn (tmp, gen_int_mode (value, Pmode)); + emit_move_insn (tmp, gen_int_mode (value, Xmode)); operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp); } - /* Never load (const_int 0) into a register, that's silly. */ - else if (operands[3] == CONST0_RTX (<VEL>mode)) + + /* For a vmv.v.x never load (const_int 0) or valid immediate operands + into a register, because we can use vmv.v.i. */ + else if (satisfies_constraint_Wc1 (operands[1]) + && (satisfies_constraint_P (operands[3]) + || operands[3] == CONST0_RTX (<VEL>mode))) ; - /* If we're broadcasting [-16..15] across more than just - element 0, then we can use vmv.v.i directly, thus avoiding - the load of the constant into a GPR. */ - else if (CONST_INT_P (operands[3]) - && IN_RANGE (INTVAL (operands[3]), -16, 15) - && !satisfies_constraint_Wb1 (operands[1])) + /* For vmv.s.x we have vmv.s.x v1, zero. 
*/ + else if (satisfies_constraint_Wb1 (operands[1]) + && operands[3] == CONST0_RTX (<VEL>mode)) ; else operands[3] = force_reg (<VEL>mode, operands[3]); @@ -2211,131 +2203,68 @@ operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]); }) -(define_insn_and_split "*pred_broadcast<mode>" - [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vd, vd, vr, vr, vr, vr") +(define_insn_and_rewrite "*pred_broadcast<mode>" + [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSI (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1") - (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl") - (match_operand 5 "const_int_operand" " i, i, i, i, i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i") + [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (vec_duplicate:V_VLSI - (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ")) - (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))] + (match_operand:<VEL> 3 "direct_broadcast_operand" " rP, rP, rJ, rJ")) + (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "@ vmv.v.%o3\t%0,%3 vmv.v.%o3\t%0,%3 - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero - vlse<sew>.v\t%0,%3,zero vmv.s.x\t%0,%z3 vmv.s.x\t%0,%z3" - "(register_operand (operands[3], <VEL>mode) - || CONST_POLY_INT_P (operands[3])) - && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)" - [(const_int 0)] - { - gcc_assert (can_create_pseudo_p ()); - if (CONST_POLY_INT_P (operands[3])) - { - rtx tmp = gen_reg_rtx (<VEL>mode); - emit_move_insn (tmp, operands[3]); - operands[3] = tmp; - } - - /* For SEW = 64 in RV32 system, we expand vmv.s.x: - andi a2,a2,1 - vsetvl zero,a2,e64 - vlse64.v */ - if (satisfies_constraint_Wb1 (operands[1])) - { - operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); - operands[1] = CONSTM1_RTX (<VM>mode); - } - - /* If the target doesn't want a strided-load broadcast we go with a regular - V1DImode load and a broadcast gather. 
*/ - if (strided_load_broadcast_p ()) - { - rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode), - GET_MODE_ALIGNMENT (<VEL>mode)); - mem = validize_mem (mem); - emit_move_insn (mem, operands[3]); - mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0))); - - emit_insn - (gen_pred_broadcast<mode> - (operands[0], operands[1], operands[2], mem, - operands[4], operands[5], operands[6], operands[7])); - } - else - { - rtx tmp = gen_reg_rtx (V1DImode); - emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3], - <VEL>mode)); - tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode); - - emit_insn - (gen_pred_gather<mode>_scalar - (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode), - operands[4], operands[5], operands[6], operands[7])); - } - DONE; - } - [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv") + "&& (operands[1] == CONSTM1_RTX (<VM>mode) + && operands[4] == CONST1_RTX (Pmode) + && (register_operand (operands[3], <VEL>mode) + || satisfies_constraint_J (operands[3])))" +{ + /* A broadcast of a single element is just a vmv.s.x. */ + operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); +} + [(set_attr "type" "vimov,vimov,vimovxv,vimovxv") (set_attr "mode" "<MODE>")]) -(define_insn "*pred_broadcast<mode>_zvfh" - [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr") +(define_insn_and_rewrite "pred_broadcast<mode>_zvfh" + [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1") - (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") - (match_operand 5 "const_int_operand" " i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (vec_duplicate:V_VLSF - (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f")) - (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "@ vfmv.v.f\t%0,%3 vfmv.v.f\t%0,%3 vfmv.s.f\t%0,%3 vfmv.s.f\t%0,%3" + "&& (operands[1] == CONSTM1_RTX (<VM>mode) + && operands[4] == CONST1_RTX (Pmode) + && (register_operand (operands[3], <VEL>mode) + || satisfies_constraint_J (operands[3])))" +{ + /* A broadcast of a single element is just a vfmv.s.f. 
*/ + operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode); +} [(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv") (set_attr "mode" "<MODE>")]) -(define_insn "*pred_broadcast<mode>_zvfhmin" - [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr") - (if_then_else:V_VLSF_ZVFHMIN - (unspec:<VM> - [(match_operand:<VM> 1 "vector_broadcast_mask_operand" " vm, vm, Wc1, Wc1") - (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") - (match_operand 5 "const_int_operand" " i, i, i, i") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (reg:SI VL_REGNUM) - (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (vec_duplicate:V_VLSF_ZVFHMIN - (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A")) - (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))] - "TARGET_VECTOR && strided_load_broadcast_p ()" - "@ - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero,%1.t - vlse<sew>.v\t%0,%3,zero - vlse<sew>.v\t%0,%3,zero" - [(set_attr "type" "vlds,vlds,vlds,vlds") - (set_attr "mode" "<MODE>")]) - (define_insn "*pred_broadcast<mode>_extended_scalar" [(set (match_operand:V_VLSI_D 0 "register_operand" "=vr, vr, vr, vr") (if_then_else:V_VLSI_D @@ -2398,6 +2327,117 @@ [(set_attr "type" "vimov,vimov") (set_attr "mode" "<MODE>")]) +(define_expand "@pred_strided_broadcast<mode>" + [(set (match_operand:V_VLS 0 "register_operand") + (if_then_else:V_VLS + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand") + (match_operand 4 "vector_length_operand") + (match_operand 5 "const_int_operand") + (match_operand 6 "const_int_operand") + (match_operand 7 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLS + (match_operand:<VEL> 3 "strided_broadcast_operand")) + (match_operand:V_VLS 2 "vector_merge_operand")))] + "TARGET_VECTOR" +{ + if (satisfies_constraint_Wb1 (operands[1])) + { + /* If we're asked to set a single element (like vmv.s.x but we + need to go via memory here) and the tail policy is agnostic + we can overwrite all elements. + Thus, set the mask to broadcast. */ + operands[1] = CONSTM1_RTX (<VM>mode); + if (!satisfies_constraint_vu (operands[2]) + && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD) + { + /* Case 2: vmv.s.x (TU, x == memory) ==> + vl = 0 or 1; + vlse.v (TU) in RV32 system */ + /* In this case we must not overwrite the residual elements, + so set the vector length to 0/1. 
*/ + operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); + } + } +}) + +(define_insn_and_split "*pred_strided_broadcast<mode>" + [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSI + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLSI + (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A")) + (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))] + "TARGET_VECTOR" + "@ + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero + vlse<sew>.v\t%0,%3,zero" + "&& !strided_load_broadcast_p () && can_create_pseudo_p ()" + [(const_int 0)] + { + rtx tmp = gen_reg_rtx (V1DImode); + emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3])); + tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode); + + emit_insn + (gen_pred_gather<mode>_scalar + (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode), + operands[4], operands[5], operands[6], operands[7])); + DONE; + } + [(set_attr "type" "vlds,vlds,vlds,vlds") + (set_attr "mode" "<MODE>")]) + +(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin" + [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr") + (if_then_else:V_VLSF_ZVFHMIN + (unspec:<VM> + [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm, Wc1, Wc1") + (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl") + (match_operand 5 "const_int_operand" " i, i, i, i") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (vec_duplicate:V_VLSF_ZVFHMIN + (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A")) + (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))] + "TARGET_VECTOR" + "@ + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero,%1.t + vlse<sew>.v\t%0,%3,zero + vlse<sew>.v\t%0,%3,zero" + "&& !strided_load_broadcast_p () + && <VEL>mode == HFmode + && can_create_pseudo_p ()" + [(const_int 0)] + { + poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode); + machine_mode vmodehi + = riscv_vector::get_vector_mode (HImode, nunits).require (); + rtx ops[] = {gen_lowpart (vmodehi, operands[0]), + gen_lowpart (HImode, operands[3])}; + riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi), + riscv_vector::UNARY_OP, ops, + (riscv_vector::avl_type) INTVAL (operands[7]), + operands[4]); + DONE; + } + [(set_attr "type" "vlds,vlds,vlds,vlds") + (set_attr "mode" "<MODE>")]) + + ;; ------------------------------------------------------------------------------- ;; ---- Predicated Strided loads/stores ;; ------------------------------------------------------------------------------- |
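
Editor's note on the autovec-opt.md and vector-iterators.md hunks: they generalize the old *uavg_floor_vx pattern so that both vaadd and vaaddu (signed/unsigned averaging add) with a duplicated scalar operand can be combined into a single .vx instruction, with the rounding mode taken from the captured VXRM value and passed to the new expand_vx_binary_vxrm_* helpers. Below is a minimal C sketch of the kind of loop these patterns target; the function name and flags are illustrative and not part of the patch. Built with something like -march=rv64gcv -O3, the vectorizer recognizes the floor-average idiom and, with these patterns, may emit vaaddu.vx directly instead of first splatting x into a vector register.

    #include <stdint.h>

    /* Floor average of each element with a loop-invariant scalar.
       The widened add avoids overflow, so the vectorizer can select an
       averaging add (vaaddu, rounding mode rdn) with a scalar operand.  */
    void
    uavg_floor_vx (uint8_t *restrict dst, const uint8_t *restrict src,
                   uint8_t x, int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = ((uint16_t) src[i] + x) >> 1;
    }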
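
Editor's note on the vector.md hunks: scalar broadcasts are split into a plain pred_broadcast (vmv.v.x / vfmv.v.f / vmv.s.x) and a new pred_strided_broadcast that goes through memory with a zero-stride vlse, falling back to a V1DImode load plus a broadcast gather when the target does not want strided-load broadcasts. The case that needs this is an element wider than a GPR, e.g. a 64-bit scalar on RV32. A small illustrative loop follows; the names and flags are mine, not from the patch. Built for an rv32 vector target, the invariant x cannot be splat with vmv.v.x because SEW (64) exceeds XLEN (32), so the expander routes it through the strided-broadcast or gather path.

    #include <stdint.h>

    /* Add a loop-invariant 64-bit scalar on a 32-bit target: the
       broadcast of x cannot use vmv.v.x, so it is expanded via a
       zero-stride load (vlse64.v) or a broadcast gather.  */
    void
    splat_add64 (uint64_t *restrict dst, const uint64_t *restrict src,
                 uint64_t x, int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = src[i] + x;
    }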