Diffstat (limited to 'gcc/config/riscv')
-rw-r--r--  gcc/config/riscv/autovec-opt.md                  |  31
-rw-r--r--  gcc/config/riscv/autovec.md                      |   7
-rw-r--r--  gcc/config/riscv/gen-riscv-mcpu-texi.cc          |  43
-rw-r--r--  gcc/config/riscv/gen-riscv-mtune-texi.cc         |  41
-rw-r--r--  gcc/config/riscv/generic-vector-ooo.md           |  85
-rw-r--r--  gcc/config/riscv/predicates.md                   |  13
-rw-r--r--  gcc/config/riscv/riscv-ext.def                   |  30
-rw-r--r--  gcc/config/riscv/riscv-protos.h                  |  16
-rw-r--r--  gcc/config/riscv/riscv-string.cc                 |   6
-rw-r--r--  gcc/config/riscv/riscv-v.cc                      | 302
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins-bases.cc  |   3
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins.cc        |  41
-rw-r--r--  gcc/config/riscv/riscv-vector-builtins.h         |   1
-rw-r--r--  gcc/config/riscv/riscv-vector-costs.cc           |  71
-rw-r--r--  gcc/config/riscv/riscv-vector-costs.h            |  16
-rw-r--r--  gcc/config/riscv/riscv.cc                        |  60
-rw-r--r--  gcc/config/riscv/riscv.md                        |  10
-rw-r--r--  gcc/config/riscv/t-riscv                         |  37
-rw-r--r--  gcc/config/riscv/vector-iterators.md             |  16
-rw-r--r--  gcc/config/riscv/vector.md                       | 380
20 files changed, 864 insertions, 345 deletions
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index d884942..6531996 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1714,7 +1714,7 @@
}
[(set_attr "type" "vialu")])
-(define_insn_and_split "*uavg_floor_vx_<mode>"
+(define_insn_and_split "*<sat_op_v_vdup>_vx_<mode>"
[(set (match_operand:V_VLSI 0 "register_operand")
(if_then_else:V_VLSI
(unspec:<VM>
@@ -1730,7 +1730,7 @@
(unspec:V_VLSI
[(match_operand:V_VLSI 3 "register_operand")
(vec_duplicate:V_VLSI
- (match_operand:<VEL> 4 "register_operand"))] UNSPEC_VAADDU)
+ (match_operand:<VEL> 4 "reg_or_int_operand"))] VSAT_VX_OP_V_VDUP)
(unspec:V_VLSI
[(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))]
"TARGET_VECTOR && can_create_pseudo_p ()"
@@ -1738,14 +1738,17 @@
"&& 1"
[(const_int 0)]
{
- insn_code code = code_for_pred_scalar (UNSPEC_VAADDU, <MODE>mode);
- rtx ops[] = {operands[0], operands[3], operands[4]};
- riscv_vector::emit_vlmax_insn (code, riscv_vector::BINARY_OP_VXRM_RDN, ops);
+ int vxrm_val = INTVAL (operands[9]);
+ riscv_vector::expand_vx_binary_vxrm_vec_vec_dup (operands[0], operands[3],
+ operands[4],
+ <VSAT_VX_OP_V_VDUP>,
+ vxrm_val, <MODE>mode);
+
DONE;
}
[(set_attr "type" "vaalu")])
-(define_insn_and_split "*uavg_floor_vx_<mode>"
+(define_insn_and_split "*<sat_op_vdup_v>_vx_<mode>"
[(set (match_operand:V_VLSI 0 "register_operand")
(if_then_else:V_VLSI
(unspec:<VM>
@@ -1760,8 +1763,8 @@
(reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
(unspec:V_VLSI
[(vec_duplicate:V_VLSI
- (match_operand:<VEL> 4 "register_operand"))
- (match_operand:V_VLSI 3 "register_operand")] UNSPEC_VAADDU)
+ (match_operand:<VEL> 4 "reg_or_int_operand"))
+ (match_operand:V_VLSI 3 "register_operand")] VSAT_VX_OP_VDUP_V)
(unspec:V_VLSI
[(match_operand:DI 2 "register_operand")] UNSPEC_VUNDEF)))]
"TARGET_VECTOR && can_create_pseudo_p ()"
@@ -1769,9 +1772,12 @@
"&& 1"
[(const_int 0)]
{
- insn_code code = code_for_pred_scalar (UNSPEC_VAADDU, <MODE>mode);
- rtx ops[] = {operands[0], operands[3], operands[4]};
- riscv_vector::emit_vlmax_insn (code, riscv_vector::BINARY_OP_VXRM_RDN, ops);
+ int vxrm_val = INTVAL (operands[9]);
+ riscv_vector::expand_vx_binary_vxrm_vec_dup_vec (operands[0], operands[3],
+ operands[4],
+ <VSAT_VX_OP_VDUP_V>,
+ vxrm_val, <MODE>mode);
+
DONE;
}
[(set_attr "type" "vaalu")])
@@ -1900,8 +1906,7 @@
emit_insn (gen_extend<vsubel><vel>2(tmp, operands[1]));
rtx ops[] = {operands[0], tmp};
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, ops);
+ riscv_vector::expand_broadcast (<MODE>mode, ops);
DONE;
}
[(set_attr "type" "vfwmuladd")]
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1fff8ac..48de5ef 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1359,9 +1359,7 @@
if (operands[2] == const0_rtx)
{
rtx ops[] = {operands[0], operands[0], operands[1]};
- riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::SCALAR_MOVE_MERGED_OP_TU,
- ops, CONST1_RTX (Pmode));
+ riscv_vector::expand_set_first_tu (<MODE>mode, ops);
}
else
{
@@ -1385,8 +1383,7 @@
VL we need for the slide. */
rtx tmp = gen_reg_rtx (<MODE>mode);
rtx ops1[] = {tmp, operands[1]};
- emit_nonvlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, ops1, length);
+ riscv_vector::expand_broadcast (<MODE>mode, ops1, length);
/* Slide exactly one element up leaving the tail elements
unchanged. */
diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
new file mode 100644
index 0000000..9681438
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
@@ -0,0 +1,43 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Core Name}");
+ puts ("");
+ puts ("@opindex mcpu");
+ puts ("@item -mcpu=@var{processor-string}");
+ puts ("Use architecture of and optimize the output for the given processor, specified");
+ puts ("by particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> coreNames;
+
+#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \
+ coreNames.push_back (CORE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_CORE
+
+ for (size_t i = 0; i < coreNames.size(); ++i) {
+ if (i == coreNames.size() - 1) {
+ printf("@samp{%s}.\n", coreNames[i].c_str());
+ } else {
+ printf("@samp{%s},\n\n", coreNames[i].c_str());
+ }
+ }
+
+ return 0;
+}
diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc
new file mode 100644
index 0000000..1bdfe2a
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc
@@ -0,0 +1,41 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Tune Name}");
+ puts ("");
+ puts ("@opindex mtune");
+ puts ("@item -mtune=@var{processor-string}");
+ puts ("Optimize the output for the given processor, specified by microarchitecture or");
+ puts ("particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> tuneNames;
+
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
+ tuneNames.push_back (TUNE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_TUNE
+
+ for (size_t i = 0; i < tuneNames.size(); ++i) {
+ printf("@samp{%s},\n\n", tuneNames[i].c_str());
+ }
+
+ puts ("and all valid options for @option{-mcpu=}.");
+
+ return 0;
+}
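Both new generator programs rely on the same X-macro technique: riscv-cores.def is written as a series of RISCV_CORE()/RISCV_TUNE() invocations, and each generator defines the macro to collect only the name argument before including the file. Below is a minimal, self-contained sketch of that pattern; the EXAMPLE_CORES list and its entries are made up for illustration and are not taken from riscv-cores.def.

#include <cstdio>
#include <string>
#include <vector>

/* Stand-in for riscv-cores.def: each entry is a macro invocation.
   The entries below are hypothetical.  */
#define EXAMPLE_CORES \
  RISCV_TUNE ("example-tune-a", pipeline_a, tune_info_a) \
  RISCV_TUNE ("example-tune-b", pipeline_b, tune_info_b)

int
main ()
{
  std::vector<std::string> names;

  /* Define the macro to keep only the first argument, then expand the
     list; the real generators do this around #include "riscv-cores.def".  */
#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
  names.push_back (TUNE_NAME);
  EXAMPLE_CORES
#undef RISCV_TUNE

  for (const std::string &n : names)
    printf ("@samp{%s},\n\n", n.c_str ());
  return 0;
}

Running this prints one @samp{...} line per entry, which is how the riscv-mcpu.texi and riscv-mtune.texi fragments are produced.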
diff --git a/gcc/config/riscv/generic-vector-ooo.md b/gcc/config/riscv/generic-vector-ooo.md
index ab9e57f..773003b 100644
--- a/gcc/config/riscv/generic-vector-ooo.md
+++ b/gcc/config/riscv/generic-vector-ooo.md
@@ -17,6 +17,9 @@
;; <http://www.gnu.org/licenses/>.
;; Vector load/store
+;; The insn reservations include "generic" as we won't have an in-order
+;; generic definition for vector instructions.
+
(define_automaton "vector_ooo")
;; Separate issue queue for vector instructions.
@@ -29,119 +32,141 @@
(define_cpu_unit "vxu_ooo_multicycle" "vector_ooo")
(define_insn_reservation "vec_load" 6
- (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vlde,vldm,vlds,vldux,vldox,vldff,vldr"))
"vxu_ooo_issue,vxu_ooo_alu")
(define_insn_reservation "vec_store" 6
- (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vste,vstm,vsts,vstux,vstox,vstr"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector segment loads/stores.
(define_insn_reservation "vec_loadstore_seg" 10
- (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\
- vssegte,vssegts,vssegtux,vssegtox")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\
+ vssegte,vssegts,vssegtux,vssegtox"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Regular vector operations and integer comparisons.
(define_insn_reservation "vec_alu" 3
- (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\
- vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\
- vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vialu,viwalu,vext,vicalu,vshift,vnshift,viminmax,vicmp,\
+ vimov,vsalu,vaalu,vsshift,vnclip,vmov,vfmov,vector,\
+ vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,vror,vwsll"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float comparison, conversion etc.
(define_insn_reservation "vec_fcmp" 3
- (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\
- vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\
- vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfrecp,vfminmax,vfcmp,vfsgnj,vfclass,vfcvtitof,\
+ vfcvtftoi,vfwcvtitof,vfwcvtftoi,vfwcvtftof,vfncvtitof,\
+ vfncvtftoi,vfncvtftof,vfncvtbf16,vfwcvtbf16"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector integer multiplication.
(define_insn_reservation "vec_imul" 4
- (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\
- vghsh,vgmul")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vimul,viwmul,vimuladd,viwmuladd,vsmul,vclmul,vclmulh,\
+ vghsh,vgmul"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float addition.
(define_insn_reservation "vec_fadd" 4
- (eq_attr "type" "vfalu,vfwalu")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfalu,vfwalu"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector float multiplication and FMA.
(define_insn_reservation "vec_fmul" 6
- (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfmul,vfwmul,vfmuladd,vfwmuladd,vfwmaccbf16,sf_vqmacc,sf_vfnrclip"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, assumed to be a generic operation for now.
(define_insn_reservation "vec_crypto" 4
- (eq_attr "type" "crypto,vclz,vctz,vcpop")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "crypto,vclz,vctz,vcpop"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, AES
(define_insn_reservation "vec_crypto_aes" 4
- (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, sha
(define_insn_reservation "vec_crypto_sha" 4
- (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsha2ms,vsha2ch,vsha2cl"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector crypto, SM3/4
(define_insn_reservation "vec_crypto_sm" 4
- (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsm4k,vsm4r,vsm3me,vsm3c"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector permute.
(define_insn_reservation "vec_perm" 3
- (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\
- vislide1down,vfslide1up,vfslide1down,vgather,vcompress")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vimerge,vfmerge,vslideup,vslidedown,vislide1up,\
+ vislide1down,vfslide1up,vfslide1down,vgather,vcompress"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector reduction.
(define_insn_reservation "vec_reduction" 8
- (eq_attr "type" "vired,viwred,vfredu,vfwredu")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vired,viwred,vfredu,vfwredu"))
"vxu_ooo_issue,vxu_ooo_multicycle")
;; Vector ordered reduction, assume the latency number is for
;; a 128-bit vector. It is scaled in riscv_sched_adjust_cost
;; for larger vectors.
(define_insn_reservation "vec_ordered_reduction" 10
- (eq_attr "type" "vfredo,vfwredo")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfredo,vfwredo"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector integer division, assume not pipelined.
(define_insn_reservation "vec_idiv" 16
- (eq_attr "type" "vidiv")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vidiv"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector float divisions and sqrt, assume not pipelined.
(define_insn_reservation "vec_float_divsqrt" 16
- (eq_attr "type" "vfdiv,vfsqrt")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vfdiv,vfsqrt"))
"vxu_ooo_issue,vxu_ooo_multicycle*3")
;; Vector mask operations.
(define_insn_reservation "vec_mask" 2
- (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\
- vfmovvf,vfmovfv")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vmalu,vmpop,vmffs,vmsfs,vmiota,vmidx,vimovvx,vimovxv,\
+ vfmovvf,vfmovfv"))
"vxu_ooo_issue,vxu_ooo_alu")
;; Vector vsetvl.
(define_insn_reservation "vec_vesetvl" 1
- (eq_attr "type" "vsetvl,vsetvl_pre")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "vsetvl,vsetvl_pre"))
"vxu_ooo_issue")
;; Vector rounding mode setters, assume pipeline barrier.
(define_insn_reservation "vec_setrm" 20
- (eq_attr "type" "wrvxrm,wrfrm")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "wrvxrm,wrfrm"))
"vxu_ooo_issue,vxu_ooo_issue*3")
;; Vector read vlen/vlenb.
(define_insn_reservation "vec_readlen" 4
- (eq_attr "type" "rdvlenb,rdvl")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "rdvlenb,rdvl"))
"vxu_ooo_issue,vxu_ooo_issue")
;; Vector sf_vcp.
(define_insn_reservation "vec_sf_vcp" 2
- (eq_attr "type" "sf_vc,sf_vc_se")
+ (and (eq_attr "tune" "generic_ooo,generic")
+ (eq_attr "type" "sf_vc,sf_vc_se"))
"vxu_ooo_issue")
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 1f9a6b5..381f96c 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -518,6 +518,10 @@
(define_predicate "vector_broadcast_mask_operand"
(ior (match_operand 0 "vector_least_significant_set_mask_operand")
+ (match_operand 0 "vector_all_trues_mask_operand")))
+
+(define_predicate "strided_broadcast_mask_operand"
+ (ior (match_operand 0 "vector_least_significant_set_mask_operand")
(ior (match_operand 0 "register_operand")
(match_operand 0 "vector_all_trues_mask_operand"))))
@@ -619,6 +623,15 @@
(define_predicate "direct_broadcast_operand"
(match_test "riscv_vector::can_be_broadcast_p (op)"))
+;; A strided broadcast is just a fallback pattern that loads from
+;; memory.
+(define_predicate "strided_broadcast_operand"
+ (match_test "riscv_vector::strided_broadcast_p (op)"))
+
+(define_predicate "any_broadcast_operand"
+ (ior (match_operand 0 "direct_broadcast_operand")
+ (match_operand 0 "strided_broadcast_operand")))
+
;; A CONST_INT operand that has exactly two bits cleared.
(define_predicate "const_nottwobits_operand"
(and (match_code "const_int")
diff --git a/gcc/config/riscv/riscv-ext.def b/gcc/config/riscv/riscv-ext.def
index 6fc6d38..09f18ad 100644
--- a/gcc/config/riscv/riscv-ext.def
+++ b/gcc/config/riscv/riscv-ext.def
@@ -80,8 +80,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{2, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 4,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -190,8 +190,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zba", "zbb", "zbs"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 1,
/* EXTRA_EXTENSION_FLAGS */ EXT_FLAG_MACRO)
DEFINE_RISCV_EXT(
@@ -216,8 +216,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ base,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 7,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -398,8 +398,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{2, 0}}),
/* FLAG_GROUP */ zi,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 11,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -464,7 +464,7 @@ DEFINE_RISCV_EXT(
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zi,
/* BITMASK_GROUP_ID */ 1,
- /* BITMASK_BIT_POSITION*/ 1,
+ /* BITMASK_BIT_POSITION*/ 8,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -476,8 +476,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zm,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 12,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -787,8 +787,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zca"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zc,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 10,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
@@ -813,8 +813,8 @@ DEFINE_RISCV_EXT(
/* DEP_EXTS */ ({"zca", "zilsd"}),
/* SUPPORTED_VERSIONS */ ({{1, 0}}),
/* FLAG_GROUP */ zc,
- /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
- /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_GROUP_ID */ 1,
+ /* BITMASK_BIT_POSITION*/ 9,
/* EXTRA_EXTENSION_FLAGS */ 0)
DEFINE_RISCV_EXT(
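The BITMASK_GROUP_ID/BITMASK_BIT_POSITION values assigned above select a group of 64-bit feature words and a bit within that group, in the style of the RISC-V C API's __riscv_feature_bits. A minimal sketch of how such a (group, bit) pair would typically be tested follows; the feature-word contents and the helper name ext_present are hypothetical and not part of this patch.

#include <cstdint>

/* Sketch: test an extension given its bitmask group id and bit position,
   assuming feature bits are stored as an array of 64-bit words
   (group 0 in features[0], group 1 in features[1], ...).  */
static bool
ext_present (const uint64_t *features, unsigned group, unsigned bit)
{
  return (features[group] >> bit) & 1;
}

int
main ()
{
  /* Hypothetical feature words: group 0 has bits 1 and 4 set,
     group 1 has bit 11 set, mirroring assignments in the hunks above.  */
  uint64_t features[2] = { (1ull << 1) | (1ull << 4), 1ull << 11 };
  return ext_present (features, 1, 11) ? 0 : 1;
}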
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index a41c4c2..539321f 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -414,8 +414,14 @@ enum insn_flags : unsigned int
/* Means INSN has VXRM operand and the value is VXRM_RNU. */
VXRM_RNU_P = 1 << 20,
+ /* Means INSN has VXRM operand and the value is VXRM_RNE. */
+ VXRM_RNE_P = 1 << 21,
+
/* Means INSN has VXRM operand and the value is VXRM_RDN. */
- VXRM_RDN_P = 1 << 21,
+ VXRM_RDN_P = 1 << 22,
+
+ /* Means INSN has VXRM operand and the value is VXRM_ROD. */
+ VXRM_ROD_P = 1 << 23,
};
enum insn_type : unsigned int
@@ -477,7 +483,9 @@ enum insn_type : unsigned int
BINARY_OP_TUMA = __MASK_OP_TUMA | BINARY_OP_P,
BINARY_OP_FRM_DYN = BINARY_OP | FRM_DYN_P,
BINARY_OP_VXRM_RNU = BINARY_OP | VXRM_RNU_P,
+ BINARY_OP_VXRM_RNE = BINARY_OP | VXRM_RNE_P,
BINARY_OP_VXRM_RDN = BINARY_OP | VXRM_RDN_P,
+ BINARY_OP_VXRM_ROD = BINARY_OP | VXRM_ROD_P,
/* Ternary operator. Always have real merge operand. */
TERNARY_OP = HAS_DEST_P | HAS_MASK_P | USE_ALL_TRUES_MASK_P | HAS_MERGE_P
@@ -672,6 +680,8 @@ void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode,
machine_mode);
void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode);
void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode);
+void expand_vx_binary_vxrm_vec_vec_dup (rtx, rtx, rtx, int, int, machine_mode);
+void expand_vx_binary_vxrm_vec_dup_vec (rtx, rtx, rtx, int, int, machine_mode);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
bool, void (*)(rtx *, rtx), enum avl_type);
@@ -695,6 +705,9 @@ bool expand_block_move (rtx, rtx, rtx, bool);
machine_mode preferred_simd_mode (scalar_mode);
machine_mode get_mask_mode (machine_mode);
void expand_vec_series (rtx, rtx, rtx, rtx = 0);
+void expand_broadcast (machine_mode, rtx *, rtx = 0);
+void expand_set_first (machine_mode, rtx *, rtx = 0);
+void expand_set_first_tu (machine_mode, rtx *, rtx = 0);
void expand_vec_init (rtx, rtx);
void expand_vec_perm (rtx, rtx, rtx, rtx);
void expand_select_vl (rtx *);
@@ -762,6 +775,7 @@ enum vlmul_type get_vlmul (rtx_insn *);
int count_regno_occurrences (rtx_insn *, unsigned int);
bool imm_avl_p (machine_mode);
bool can_be_broadcast_p (rtx);
+bool strided_broadcast_p (rtx);
bool gather_scatter_valid_offset_p (machine_mode);
HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 9080189..61c4a09 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1625,16 +1625,14 @@ expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
Otherwise, use a predicated store. */
if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl)))
{
- emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
- broadcast_ops);
+ riscv_vector::expand_broadcast (info.vmode, broadcast_ops);
emit_move_insn (dst, fill_value);
}
else
{
if (!satisfies_constraint_vl (info.avl))
info.avl = force_reg (Pmode, info.avl);
- emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
- riscv_vector::UNARY_OP, broadcast_ops, info.avl);
+ riscv_vector::expand_broadcast (info.vmode, broadcast_ops, info.avl);
machine_mode mask_mode
= riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode))
.require ();
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 242ac08..c9c8328 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -351,9 +351,12 @@ public:
add_rounding_mode_operand (FRM_RNE);
else if (m_insn_flags & VXRM_RNU_P)
add_rounding_mode_operand (VXRM_RNU);
+ else if (m_insn_flags & VXRM_RNE_P)
+ add_rounding_mode_operand (VXRM_RNE);
else if (m_insn_flags & VXRM_RDN_P)
add_rounding_mode_operand (VXRM_RDN);
-
+ else if (m_insn_flags & VXRM_ROD_P)
+ add_rounding_mode_operand (VXRM_ROD);
if (insn_data[(int) icode].n_operands != m_opno)
internal_error ("invalid number of operands for insn %s, "
@@ -1190,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target,
return false;
}
+/* Helper function to emit a vmv.v.x/vmv.v.i or one of their float variants.
+   If VL is not given a VLMAX insn will be emitted, otherwise
+   a non-VLMAX insn with length VL.
+   If the value to be broadcast is not suitable for vmv.v.x
+ fall back to a vlse with zero stride. This itself has a
+ fallback if the uarch prefers not to use a strided load
+ for broadcast. */
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+ type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ UNARY_OP, ops, type, vl);
+}
+
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeping the tail elements
+ unchanged (TU) */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[2];
+ if (!vl)
+ vl = const1_rtx;
+ if (can_be_broadcast_p (elt))
+ emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+ else
+ emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
+
static void
expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
{
@@ -1226,7 +1282,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
if (lra_in_progress)
{
rtx ops[] = {result, elt};
- emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+ expand_broadcast (mode, ops);
}
else
{
@@ -1278,8 +1334,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder)
{
dup = gen_reg_rtx (builder->new_mode ());
rtx ops[] = {dup, ele};
- emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder->new_mode (), ops);
}
else
dup = expand_vector_broadcast (builder->new_mode (), ele);
@@ -1322,8 +1377,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder)
rtx tmp1 = gen_reg_rtx (builder->mode ());
rtx dup_ops[] = {tmp1, builder->elt (0)};
- emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
- dup_ops);
+ expand_broadcast (builder->mode (), dup_ops);
for (unsigned int i = 1; i < builder->npatterns (); i++)
{
@@ -2136,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x)
}
}
+/* This is a helper for binary ops with DImode scalar operands that are
+ broadcast (like vadd.vx v1, a1).
+ Instead of having similar code for all the expanders this function
+ unifies the handling. For 64-bit targets all we do is choose
+ between the vi variant (if available) and the register variant.
+ For 32-bit targets we either create the sign-extending variant
+ of vop.vx (when the immediate fits 32 bits) or emit a vector
+ broadcast of the 64-bit register/immediate and switch to a
+   vop.vv (replacing the scalar op with the broadcast vector).  */
+
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
machine_mode vector_mode, bool has_vi_variant_p,
void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
{
machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+ /* If the scalar broadcast op fits an immediate, use the
+ vop.vi variant if there is one. */
if (has_vi_variant_p)
{
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
+ /* On a 64-bit target we can always use the vop.vx variant. */
if (TARGET_64BIT)
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2155,6 +2223,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
+  /* On 32-bit targets, an immediate that fits 32 bits (and has no vop.vi
+     variant) can use the sign-extending (SI -> DI) vop.vx variants.  */
if (immediate_operand (*scalar_op, Pmode))
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2164,40 +2234,29 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
- bool avoid_strided_broadcast = false;
+ /* Now we're left with a 64-bit immediate or a register.
+ We cannot use a vop.vx variant but must broadcast the value first
+ and switch to a vop.vv variant.
+ Broadcast can either be done via vlse64.v v1, reg, zero
+ or by loading one 64-bit element (vle64.v) and using a
+ broadcast vrgather.vi. This is decided when splitting
+ the strided broadcast insn. */
+ gcc_assert (!TARGET_64BIT
+ && (CONST_INT_P (*scalar_op)
+ || register_operand (*scalar_op, scalar_mode)));
+
if (CONST_INT_P (*scalar_op))
{
if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
- {
- if (strided_load_broadcast_p ())
- *scalar_op = force_const_mem (scalar_mode, *scalar_op);
- else
- avoid_strided_broadcast = true;
- }
+ *scalar_op = force_const_mem (scalar_mode, *scalar_op);
else
*scalar_op = force_reg (scalar_mode, *scalar_op);
}
rtx tmp = gen_reg_rtx (vector_mode);
- if (!avoid_strided_broadcast)
- {
- rtx ops[] = {tmp, *scalar_op};
- emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
- type, vl);
- }
- else
- {
- /* Load scalar as V1DI and broadcast via vrgather.vi. */
- rtx tmp1 = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
- scalar_mode));
- tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
-
- rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
- emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
- BINARY_OP, ops);
- }
-
+ rtx ops[] = {tmp, *scalar_op};
+ emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+ UNARY_OP, ops, type, vl);
emit_vector_func (operands, tmp);
return true;
@@ -2591,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
/* Step 1: Broadcast the first pattern. */
rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
- emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder.mode (), ops);
/* Step 2: Merge the rest iteration of pattern. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
@@ -2605,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
{
rtx ops[] = {dup, merge_mask};
- emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
- SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+ expand_set_first (GET_MODE (dup), ops);
}
else /* vmv.v.x. */
{
@@ -2614,8 +2671,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
Pmode);
- emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
- ops, vl);
+ expand_broadcast (mask_int_mode, ops, vl);
}
emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
@@ -4706,20 +4762,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
rtx m1_tmp = gen_reg_rtx (m1_mode);
rtx scalar_move_ops[] = {m1_tmp, init};
- insn_code icode = code_for_pred_broadcast (m1_mode);
if (need_mask_operand_p (insn_flags))
{
if (need_vl0_safe)
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+ expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
else
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+ expand_set_first (m1_mode, scalar_move_ops, vl_op);
}
else
- emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+ expand_set_first (m1_mode, scalar_move_ops);
rtx m1_tmp2 = gen_reg_rtx (m1_mode);
rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
+ insn_code icode;
if (need_vl0_safe)
icode = code_for_pred (unspec_for_vl0_safe, vmode);
else
@@ -5597,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
}
+static enum insn_type
+get_insn_type_by_vxrm_val (int vxrm_val)
+{
+ enum insn_type itype;
+
+ switch (vxrm_val)
+ {
+ case VXRM_RNU:
+ itype = BINARY_OP_VXRM_RNU;
+ break;
+ case VXRM_RNE:
+ itype = BINARY_OP_VXRM_RNE;
+ break;
+ case VXRM_RDN:
+ itype = BINARY_OP_VXRM_RDN;
+ break;
+ case VXRM_ROD:
+ itype = BINARY_OP_VXRM_ROD;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return itype;
+}
+
+/* Expand the binary vx combine with the format v2 = vop(v1, vec_dup(x))
+   and its vxrm value, i.e. the second op comes from the vec_duplicate
+   and the first op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
+
+/* Expand the binary vx combine with the format v2 = vop(vec_dup(x), v1)
+   and its vxrm value, i.e. the first op comes from the vec_duplicate
+   and the second op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
+
/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
Aka the second op comes from the vec_duplicate, and the first op is
the vector reg. */
@@ -5808,25 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
return count;
}
-/* Return true if the OP can be directly broadcast. */
+/* Return true if the OP can be broadcast with a
+ v[f]mv.v.[xif] instruction. */
+
bool
can_be_broadcast_p (rtx op)
{
machine_mode mode = GET_MODE (op);
- /* We don't allow RA (register allocation) reload generate
- (vec_duplicate:DI reg) in RV32 system wheras we allow
- (vec_duplicate:DI mem) in RV32 system. */
- if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
- && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
- && !satisfies_constraint_Wdm (op))
+
+  /* Zero always works and we can always put an immediate into a
+     register.
+     What's tricky is that for an immediate we don't know the mode
+     of the register it will end up in, i.e. what element size we
+     want to broadcast.  So even if the immediate is small it might
+     still end up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set, can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+ if (rtx_equal_p (op, CONST0_RTX (mode))
+ || const_int_operand (op, Xmode))
+ return true;
+
+ /* Do not accept DImode broadcasts on !TARGET_64BIT. Those
+ are handled by strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return false;
+
+ /* Non-register operands that can be forced into a register we can
+ handle. These don't need to use strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+ /* Likewise, do not accept HFmode broadcast if we don't have
+ vfmv.v.f for 16-bit registers available. */
+ if (mode == HFmode && !TARGET_ZVFH)
+ return false;
+
+ /* Same for float, just that we can always handle 64-bit doubles
+ even on !TARGET_64BIT. We have ruled out 16-bit HF already
+ above. */
+ if (FLOAT_MODE_P (mode)
+ && (memory_operand (op, mode) || CONSTANT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+  /* After excluding all the cases we cannot handle, the register types
+     that remain can always be broadcast.  */
+ if (register_operand (op, mode))
+ return true;
+
+ return false;
+}
+
+/* Return true for all operands that cannot use vmv.v.x, vfmv.v.f,
+   vmv.s.x, or vfmv.s.f but rather need to go via memory.  */
+
+bool
+strided_broadcast_p (rtx op)
+{
+ machine_mode mode = GET_MODE (op);
+ if (!memory_operand (op, mode)
+ && !register_operand (op, mode)
+ && !rtx_equal_p (op, CONST0_RTX (mode))
+ && !const_int_operand (op, mode))
return false;
- if (satisfies_constraint_K (op) || register_operand (op, mode)
- || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
- || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+ DImode elements. */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return true;
+
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+ if (!TARGET_ZVFH && mode == HFmode)
return true;
- return can_create_pseudo_p () && nonmemory_operand (op, mode);
+ return false;
}
void
@@ -5941,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
return false;
}
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+ That's the case if we're dealing with a scalar broadcast that
+ has VL = 1. */
+
bool
splat_to_scalar_move_p (rtx *ops)
{
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index bf5172c..7e4d396 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -643,7 +643,8 @@ public:
return e.use_exact_insn (code_for_pred_mov (e.vector_mode ()));
case OP_TYPE_x:
case OP_TYPE_f:
- return e.use_exact_insn (code_for_pred_broadcast (e.vector_mode ()));
+ return e.use_scalar_broadcast_insn
+ (code_for_pred_broadcast (e.vector_mode ()));
default:
gcc_unreachable ();
}
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc
index 8810af0..0db7549 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4753,7 +4753,10 @@ function_expander::use_ternop_insn (bool vd_accum_p, insn_code icode)
}
/* Implement the call using instruction ICODE, with a 1:1 mapping between
- arguments and input operands. */
+ arguments and input operands.
+ There are operands that cannot be broadcast using v[f]mv. In that case
+ we switch to a strided broadcast. */
+
rtx
function_expander::use_widen_ternop_insn (insn_code icode)
{
@@ -4794,7 +4797,10 @@ function_expander::use_widen_ternop_insn (insn_code icode)
}
/* Implement the call using instruction ICODE, with a 1:1 mapping between
- arguments and input operands. */
+ arguments and input operands.
+ There are operands that cannot be broadcast using v[f]mv. In that case
+ we switch to a strided broadcast. */
+
rtx
function_expander::use_scalar_move_insn (insn_code icode)
{
@@ -4812,6 +4818,37 @@ function_expander::use_scalar_move_insn (insn_code icode)
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
+ if (!can_be_broadcast_p (m_ops[3].value))
+ icode = code_for_pred_strided_broadcast (vector_mode ());
+
+ add_input_operand (Pmode, get_tail_policy_for_pred (pred));
+ add_input_operand (Pmode, get_mask_policy_for_pred (pred));
+ add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
+ return generate_insn (icode);
+}
+
+/* Implement the call using instruction ICODE, with a 1:1 mapping between
+ arguments and input operands. */
+rtx
+function_expander::use_scalar_broadcast_insn (insn_code icode)
+{
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+
+ /* Record the offset to get the argument. */
+ int arg_offset = 0;
+ add_all_one_mask_operand (mask_mode ());
+
+ if (use_real_merge_p (pred))
+ add_input_operand (arg_offset++);
+ else
+ add_vundef_operand (mode);
+
+ for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
+ add_input_operand (argno);
+
+ if (!can_be_broadcast_p (m_ops[3].value))
+ icode = code_for_pred_strided_broadcast (vector_mode ());
+
add_input_operand (Pmode, get_tail_policy_for_pred (pred));
add_input_operand (Pmode, get_mask_policy_for_pred (pred));
add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h
index 1f2587a..86d8115 100644
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -497,6 +497,7 @@ public:
rtx use_ternop_insn (bool, insn_code);
rtx use_widen_ternop_insn (insn_code);
rtx use_scalar_move_insn (insn_code);
+ rtx use_scalar_broadcast_insn (insn_code);
rtx generate_insn (insn_code);
/* The function call expression. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 4d8170d..1c6bc25 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -178,8 +178,8 @@ get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
STMT 5 (be vectorized) -- point 2
...
*/
-static void
-compute_local_program_points (
+void
+costs::compute_local_program_points (
vec_info *vinfo,
hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
@@ -274,14 +274,14 @@ loop_invariant_op_p (class loop *loop,
/* Return true if the variable should be counted into liveness. */
static bool
-variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
- bool lhs_p)
+variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info,
+ slp_tree node, tree var, bool lhs_p)
{
if (!var)
return false;
gimple *stmt = STMT_VINFO_STMT (stmt_info);
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
{
if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE
@@ -357,8 +357,8 @@ variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
The live range of SSA 1 is [1, 3] in bb 2.
The live range of SSA 2 is [0, 4] in bb 3. */
-static machine_mode
-compute_local_live_ranges (
+machine_mode
+costs::compute_local_live_ranges (
loop_vec_info loop_vinfo,
const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
@@ -388,8 +388,11 @@ compute_local_live_ranges (
unsigned int point = program_point.point;
gimple *stmt = program_point.stmt;
tree lhs = gimple_get_lhs (stmt);
- if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
- true))
+ slp_tree *node = vinfo_slp_map.get (program_point.stmt_info);
+ if (!node)
+ continue;
+ if (variable_vectorized_p (loop, program_point.stmt_info,
+ *node, lhs, true))
{
biggest_mode = get_biggest_mode (biggest_mode,
TYPE_MODE (TREE_TYPE (lhs)));
@@ -406,8 +409,8 @@ compute_local_live_ranges (
for (i = 0; i < gimple_num_args (stmt); i++)
{
tree var = gimple_arg (stmt, i);
- if (variable_vectorized_p (loop, program_point.stmt_info, var,
- false))
+ if (variable_vectorized_p (loop, program_point.stmt_info,
+ *node, var, false))
{
biggest_mode
= get_biggest_mode (biggest_mode,
@@ -597,11 +600,11 @@ get_store_value (gimple *stmt)
}
/* Return true if additional vector vars needed. */
-static bool
-need_additional_vector_vars_p (stmt_vec_info stmt_info)
+bool
+costs::need_additional_vector_vars_p (stmt_vec_info stmt_info,
+ slp_tree node)
{
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (type == load_vec_info_type || type == store_vec_info_type)
{
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
@@ -657,8 +660,8 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
Then, after this function, we update SSA 1 live range in bb 2
into [2, 4] since SSA 1 is live out into bb 3. */
-static void
-update_local_live_ranges (
+void
+costs::update_local_live_ranges (
vec_info *vinfo,
hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
@@ -685,8 +688,13 @@ update_local_live_ranges (
{
gphi *phi = psi.phi ();
stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
- if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
- == undef_vec_info_type)
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ slp_tree *node = vinfo_slp_map.get (stmt_info);
+
+ if (!node)
+ continue;
+
+ if (SLP_TREE_TYPE (*node) == undef_vec_info_type)
continue;
for (j = 0; j < gimple_phi_num_args (phi); j++)
@@ -761,9 +769,12 @@ update_local_live_ranges (
if (!is_gimple_assign_or_call (gsi_stmt (si)))
continue;
stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
- enum stmt_vec_info_type type
- = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
- if (need_additional_vector_vars_p (stmt_info))
+ stmt_info = vect_stmt_to_vectorize (stmt_info);
+ slp_tree *node = vinfo_slp_map.get (stmt_info);
+ if (!node)
+ continue;
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (*node);
+ if (need_additional_vector_vars_p (stmt_info, *node))
{
/* For non-adjacent load/store STMT, we will potentially
convert it into:
@@ -816,8 +827,8 @@ update_local_live_ranges (
}
/* Compute the maximum live V_REGS. */
-static bool
-has_unexpected_spills_p (loop_vec_info loop_vinfo)
+bool
+costs::has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
/* Compute local program points.
It's a fast and effective computation. */
@@ -899,7 +910,11 @@ costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
/* Detect whether we're vectorizing for VLA and should apply the unrolling
heuristic described above m_unrolled_vls_niters. */
record_potential_vls_unrolling (loop_vinfo);
+}
+void
+costs::record_lmul_spills (loop_vec_info loop_vinfo)
+{
/* Detect whether the LOOP has unexpected spills. */
record_potential_unexpected_spills (loop_vinfo);
}
@@ -1239,8 +1254,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
int stmt_cost
= targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);
+ if (stmt_info && node)
+ vinfo_slp_map.put (stmt_info, node);
+
/* Do one-time initialization based on the vinfo. */
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+
if (!m_analyzed_vinfo)
{
if (loop_vinfo)
@@ -1326,6 +1345,8 @@ costs::finish_cost (const vector_costs *scalar_costs)
{
if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
{
+ record_lmul_spills (loop_vinfo);
+
adjust_vect_cost_per_loop (loop_vinfo);
}
vector_costs::finish_cost (scalar_costs);
diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h
index de546a6..b84ceb1 100644
--- a/gcc/config/riscv/riscv-vector-costs.h
+++ b/gcc/config/riscv/riscv-vector-costs.h
@@ -91,7 +91,10 @@ private:
typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
hash_set <tree_pair_hash> memrefs;
+ hash_map <stmt_vec_info, slp_tree> vinfo_slp_map;
+
void analyze_loop_vinfo (loop_vec_info);
+ void record_lmul_spills (loop_vec_info loop_vinfo);
void record_potential_vls_unrolling (loop_vec_info);
bool prefer_unrolled_loop () const;
@@ -103,6 +106,19 @@ private:
bool m_has_unexpected_spills_p = false;
void record_potential_unexpected_spills (loop_vec_info);
+ void compute_local_program_points (vec_info *,
+ hash_map<basic_block, vec<stmt_point>> &);
+ void update_local_live_ranges (vec_info *,
+ hash_map<basic_block, vec<stmt_point>> &,
+ hash_map<basic_block, hash_map<tree, pair>> &,
+ machine_mode *);
+ machine_mode compute_local_live_ranges
+ (loop_vec_info, const hash_map<basic_block, vec<stmt_point>> &,
+ hash_map<basic_block, hash_map<tree, pair>> &);
+
+ bool has_unexpected_spills_p (loop_vec_info);
+ bool need_additional_vector_vars_p (stmt_vec_info, slp_tree);
+
void adjust_vect_cost_per_loop (loop_vec_info);
unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind,
loop_vec_info,
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 3324819..0a9fcef 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -170,7 +170,7 @@ struct GTY(()) riscv_frame_info {
};
enum riscv_privilege_levels {
- UNKNOWN_MODE, USER_MODE, SUPERVISOR_MODE, MACHINE_MODE
+ UNKNOWN_MODE, SUPERVISOR_MODE, MACHINE_MODE, RNMI_MODE
};
struct GTY(()) mode_switching_info {
@@ -4040,6 +4040,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
switch (XINT (op, 1))
{
case UNSPEC_VAADDU:
+ case UNSPEC_VAADD:
*total
= get_vector_binary_rtx_cost (op, scalar2vr_cost);
break;
@@ -6924,12 +6925,18 @@ riscv_handle_type_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args,
}
string = TREE_STRING_POINTER (cst);
- if (strcmp (string, "user") && strcmp (string, "supervisor")
- && strcmp (string, "machine"))
+ if (!strcmp (string, "rnmi") && !TARGET_SMRNMI)
+ {
+ error ("attribute 'rnmi' requires the Smrnmi ISA extension");
+ *no_add_attrs = true;
+ }
+ else if (strcmp (string, "supervisor")
+ && strcmp (string, "machine")
+ && strcmp (string, "rnmi"))
{
warning (OPT_Wattributes,
- "argument to %qE attribute is not %<\"user\"%>, %<\"supervisor\"%>, "
- "or %<\"machine\"%>", name);
+ "argument to %qE attribute is not %<\"supervisor\"%>, "
+ "%<\"machine\"%>, or %<\"rnmi\"%>", name);
*no_add_attrs = true;
}
}
@@ -9710,12 +9717,12 @@ riscv_expand_epilogue (int style)
if (th_int_mask && TH_INT_INTERRUPT (cfun))
emit_jump_insn (gen_th_int_pop ());
- else if (mode == MACHINE_MODE)
- emit_jump_insn (gen_riscv_mret ());
else if (mode == SUPERVISOR_MODE)
emit_jump_insn (gen_riscv_sret ());
- else
- emit_jump_insn (gen_riscv_uret ());
+ else if (mode == RNMI_MODE)
+ emit_jump_insn (gen_riscv_mnret ());
+ else /* Must be MACHINE_MODE. */
+ emit_jump_insn (gen_riscv_mret ());
}
else if (style != SIBCALL_RETURN)
{
@@ -12057,10 +12064,10 @@ riscv_get_interrupt_type (tree decl)
{
const char *string = TREE_STRING_POINTER (TREE_VALUE (attr_args));
- if (!strcmp (string, "user"))
- return USER_MODE;
- else if (!strcmp (string, "supervisor"))
+ if (!strcmp (string, "supervisor"))
return SUPERVISOR_MODE;
+ else if (!strcmp (string, "rnmi"))
+ return RNMI_MODE;
else /* Must be "machine". */
return MACHINE_MODE;
}
@@ -12677,14 +12684,31 @@ riscv_estimated_poly_value (poly_int64 val,
/* Return true if the vector misalignment factor is supported by the
target. */
bool
-riscv_support_vector_misalignment (machine_mode mode,
- const_tree type ATTRIBUTE_UNUSED,
- int misalignment,
- bool is_packed ATTRIBUTE_UNUSED)
+riscv_support_vector_misalignment (machine_mode mode, const_tree type,
+ int misalignment, bool is_packed,
+ bool is_gather_scatter)
{
- /* Depend on movmisalign pattern. */
+ /* IS_PACKED is true if the corresponding scalar element is not naturally
+     aligned.  If the misalignment is unknown and the access is packed
+ we defer to the default hook which will check if movmisalign is present.
+ Movmisalign, in turn, depends on TARGET_VECTOR_MISALIGN_SUPPORTED. */
+ if (misalignment == DR_MISALIGNMENT_UNKNOWN)
+ {
+ if (!is_packed)
+ return true;
+ }
+ else
+ {
+ /* If we know that misalignment is a multiple of the element size, we're
+ good. */
+ if (misalignment % TYPE_ALIGN_UNIT (type) == 0)
+ return true;
+ }
+
+ /* Otherwise fall back to movmisalign again. */
return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ is_packed,
+ is_gather_scatter);
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index c3b504d..578dd43 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -120,7 +120,7 @@
;; Interrupt handler instructions.
UNSPECV_MRET
UNSPECV_SRET
- UNSPECV_URET
+ UNSPECV_MNRET
;; Blockage and synchronization.
UNSPECV_BLOCKAGE
@@ -4166,11 +4166,11 @@
"sret"
[(set_attr "type" "ret")])
-(define_insn "riscv_uret"
+(define_insn "riscv_mnret"
[(return)
- (unspec_volatile [(const_int 0)] UNSPECV_URET)]
- ""
- "uret"
+ (unspec_volatile [(const_int 0)] UNSPECV_MNRET)]
+ "TARGET_SMRNMI"
+ "mnret"
[(set_attr "type" "ret")])
(define_insn "stack_tie<mode>"
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 7aac56a..a7eaa8b 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext)
$(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi
$(STAMP) s-riscv-ext.texi
-# Run `riscv-regen' after you changed or added anything from riscv-ext*.def
+RISCV_CORES_DEFS = \
+ $(srcdir)/config/riscv/riscv-cores.def
+
+build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true
+
+$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true
+
+s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi
+ $(STAMP) s-riscv-mtune.texi
+
+s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi
+ $(STAMP) s-riscv-mcpu.texi
+
+# Run `riscv-regen' after you changed or added anything from riscv-ext*.def and riscv-cores*.def
.PHONY: riscv-regen
-riscv-regen: s-riscv-ext.texi s-riscv-ext.opt
+riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index 5f6cc42..aa3b6fb 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4013,6 +4013,14 @@
UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL
UNSPEC_VSSRL UNSPEC_VSSRA])
+(define_int_iterator VSAT_VX_OP_V_VDUP [
+ UNSPEC_VAADDU UNSPEC_VAADD
+])
+
+(define_int_iterator VSAT_VX_OP_VDUP_V [
+ UNSPEC_VAADDU UNSPEC_VAADD
+])
+
(define_int_iterator VSAT_ARITH_OP [UNSPEC_VAADDU UNSPEC_VAADD
UNSPEC_VASUBU UNSPEC_VASUB UNSPEC_VSMUL])
(define_int_iterator VSAT_SHIFT_OP [UNSPEC_VSSRL UNSPEC_VSSRA])
@@ -4047,6 +4055,14 @@
(UNSPEC_VSSRA "vsshift") (UNSPEC_VNCLIP "vnclip")
(UNSPEC_VNCLIPU "vnclip")])
+(define_int_attr sat_op_v_vdup [
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
+])
+
+(define_int_attr sat_op_vdup_v [
+ (UNSPEC_VAADDU "aaddu") (UNSPEC_VAADD "aadd")
+])
+
(define_int_attr misc_op [(UNSPEC_VMSBF "sbf") (UNSPEC_VMSIF "sif") (UNSPEC_VMSOF "sof")
(UNSPEC_VFRSQRT7 "rsqrt7")])
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index c498166..66b7670 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1551,20 +1551,44 @@
(define_expand "vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR"
{
- /* Early expand DImode broadcast in RV32 system to avoid RA reload
- generate (set (reg) (vec_duplicate:DI)). */
+ /* Don't keep a DImode broadcast for RV32 in the vec_duplicate form.
+ Otherwise combine or late combine could end up doing
+ "64-bit broadcast" (!= vmv.v.x)
+ + vadd.vv
+ = vadd.vx
+ which would be invalid. */
bool gt_p = maybe_gt (GET_MODE_SIZE (<VEL>mode), GET_MODE_SIZE (Pmode));
if (!FLOAT_MODE_P (<VEL>mode) && gt_p)
{
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
- riscv_vector::UNARY_OP, operands);
- DONE;
+ riscv_vector::emit_vlmax_insn
+ (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP, operands);
+ DONE;
}
- /* Otherwise, allow it fall into general vec_duplicate pattern
- which allow us to have vv->vx combine optimization in later pass. */
+
+ /* Even though we can eventually broadcast any permissible
+ constant by moving it into a register we need to force
+ any non-immediate one into a register here.
+ If we didn't do that we couldn't fwprop/late-combine
+ vec_duplicate 123.45f
+ + vfadd.vv
+ = vfadd.vf
+ because the constant is valid for vec_duplicate but not
+ for vfadd.vf. Therefore we need to do
+ fa0 = 123.45f
+ vec_duplicate fa0
+ + vfadd.vv
+ = vfadd.vf */
+ if (!satisfies_constraint_P (operands[1])
+ && !satisfies_constraint_J (operands[1])
+ && !rtx_equal_p (operands[1], CONST0_RTX (<VEL>mode))
+ && !memory_operand (operands[1], <VEL>mode))
+ operands[1] = force_reg (<VEL>mode, operands[1]);
+
+ /* Otherwise keep the vec_duplicate pattern until split. */
})
;; According to GCC internal:
@@ -1574,28 +1598,20 @@
(define_insn_and_split "*vec_duplicate<mode>"
[(set (match_operand:V_VLS 0 "register_operand")
(vec_duplicate:V_VLS
- (match_operand:<VEL> 1 "direct_broadcast_operand")))]
+ (match_operand:<VEL> 1 "any_broadcast_operand")))]
"TARGET_VECTOR && can_create_pseudo_p ()"
"#"
"&& 1"
[(const_int 0)]
{
- if (!strided_load_broadcast_p ()
- && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
- {
- /* For Float16, reinterpret as HImode, broadcast and reinterpret
- back. */
- poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
- machine_mode vmodehi
- = riscv_vector::get_vector_mode (HImode, nunits).require ();
- rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
- lowpart_subreg (HImode, operands[1], HFmode)};
- riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
- riscv_vector::UNARY_OP, ops);
- }
- else
+ if (riscv_vector::can_be_broadcast_p (operands[1]))
riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
riscv_vector::UNARY_OP, operands);
+ else
+ riscv_vector::emit_vlmax_insn (code_for_pred_strided_broadcast
+ (<MODE>mode), riscv_vector::UNARY_OP,
+ operands);
+
DONE;
}
[(set_attr "type" "vector")]
@@ -2141,69 +2157,45 @@
(match_operand:V_VLS 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
- /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f
- has better chances to do vsetvl fusion in vsetvl pass. */
bool wrap_vec_dup = true;
rtx vec_cst = NULL_RTX;
- if (riscv_vector::splat_to_scalar_move_p (operands))
- {
- operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- else if (immediate_operand (operands[3], <VEL>mode)
- && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
- && (/* -> pred_broadcast<mode>_zero */
- (vector_least_significant_set_mask_operand (operands[1],
- <VM>mode)
- && vector_const_0_operand (vec_cst, <MODE>mode))
- || (/* pred_broadcast<mode>_imm */
- vector_all_trues_mask_operand (operands[1], <VM>mode)
- && vector_const_int_or_double_0_operand (vec_cst,
- <MODE>mode))))
+ if (immediate_operand (operands[3], <VEL>mode)
+ && (vec_cst = gen_const_vec_duplicate (<MODE>mode, operands[3]))
+ && (/* -> pred_broadcast<mode>_zero */
+ (vector_least_significant_set_mask_operand (operands[1],
+ <VM>mode)
+ && vector_const_0_operand (vec_cst, <MODE>mode))
+ || (/* pred_broadcast<mode>_imm */
+ vector_all_trues_mask_operand (operands[1], <VM>mode)
+ && vector_const_int_or_double_0_operand (vec_cst,
+ <MODE>mode))))
{
operands[3] = vec_cst;
wrap_vec_dup = false;
}
- /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */
- else if (satisfies_constraint_Wdm (operands[3]))
- {
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */
- if (satisfies_constraint_vu (operands[2]))
- operands[1] = CONSTM1_RTX (<VM>mode);
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
- {
- /* Case 2: vmv.s.x (TU, x == memory) ==>
- vl = 0 or 1; + vlse.v (TU) in RV32 system */
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
- else
- /* Case 3: load x (memory) to register. */
- operands[3] = force_reg (<VEL>mode, operands[3]);
- }
- }
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
- && (immediate_operand (operands[3], Pmode)
+ else if (GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD
+ && satisfies_constraint_Wb1 (operands[1])
+ && (immediate_operand (operands[3], Xmode)
|| (CONST_POLY_INT_P (operands[3])
&& known_ge (rtx_to_poly_int64 (operands[3]), 0U)
- && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
+ && known_le (rtx_to_poly_int64 (operands[3]),
+ GET_MODE_SIZE (<MODE>mode)))))
{
rtx tmp = gen_reg_rtx (Pmode);
poly_int64 value = rtx_to_poly_int64 (operands[3]);
- emit_move_insn (tmp, gen_int_mode (value, Pmode));
+ emit_move_insn (tmp, gen_int_mode (value, Xmode));
operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp);
}
- /* Never load (const_int 0) into a register, that's silly. */
- else if (operands[3] == CONST0_RTX (<VEL>mode))
+
+ /* For a vmv.v.x never load (const_int 0) or valid immediate operands
+ into a register, because we can use vmv.v.i. */
+ else if (satisfies_constraint_Wc1 (operands[1])
+ && (satisfies_constraint_P (operands[3])
+ || operands[3] == CONST0_RTX (<VEL>mode)))
;
- /* If we're broadcasting [-16..15] across more than just
- element 0, then we can use vmv.v.i directly, thus avoiding
- the load of the constant into a GPR. */
- else if (CONST_INT_P (operands[3])
- && IN_RANGE (INTVAL (operands[3]), -16, 15)
- && !satisfies_constraint_Wb1 (operands[1]))
+  /* For vmv.s.x a zero scalar can be used directly: vmv.s.x v1, zero.  */
+ else if (satisfies_constraint_Wb1 (operands[1])
+ && operands[3] == CONST0_RTX (<VEL>mode))
;
else
operands[3] = force_reg (<VEL>mode, operands[3]);
@@ -2211,131 +2203,68 @@
operands[3] = gen_rtx_VEC_DUPLICATE (<MODE>mode, operands[3]);
})
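The vmv.v.i path kept above corresponds to splatting a small immediate; a hypothetical source loop that exercises it (illustrative C, not part of the patch):

/* A constant in [-16, 15] can be broadcast directly with
   vmv.v.i vd, 5, avoiding a li + vmv.v.x sequence.  */
void
splat_small_imm (int *restrict dst, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = 5;
}
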
-(define_insn_and_split "*pred_broadcast<mode>"
- [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vd, vd, vr, vr, vr, vr")
+(define_insn_and_rewrite "*pred_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1, vm, vm,Wc1,Wc1,Wb1,Wb1")
- (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl,rvl,rvl,rvl,rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i, i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i, i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSI
- (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ"))
- (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " rP, rP, rJ, rJ"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vmv.v.%o3\t%0,%3
vmv.v.%o3\t%0,%3
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero
vmv.s.x\t%0,%z3
vmv.s.x\t%0,%z3"
- "(register_operand (operands[3], <VEL>mode)
- || CONST_POLY_INT_P (operands[3]))
- && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
- [(const_int 0)]
- {
- gcc_assert (can_create_pseudo_p ());
- if (CONST_POLY_INT_P (operands[3]))
- {
- rtx tmp = gen_reg_rtx (<VEL>mode);
- emit_move_insn (tmp, operands[3]);
- operands[3] = tmp;
- }
-
- /* For SEW = 64 in RV32 system, we expand vmv.s.x:
- andi a2,a2,1
- vsetvl zero,a2,e64
- vlse64.v */
- if (satisfies_constraint_Wb1 (operands[1]))
- {
- operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
- operands[1] = CONSTM1_RTX (<VM>mode);
- }
-
- /* If the target doesn't want a strided-load broadcast we go with a regular
- V1DImode load and a broadcast gather. */
- if (strided_load_broadcast_p ())
- {
- rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
- GET_MODE_ALIGNMENT (<VEL>mode));
- mem = validize_mem (mem);
- emit_move_insn (mem, operands[3]);
- mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
-
- emit_insn
- (gen_pred_broadcast<mode>
- (operands[0], operands[1], operands[2], mem,
- operands[4], operands[5], operands[6], operands[7]));
- }
- else
- {
- rtx tmp = gen_reg_rtx (V1DImode);
- emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
- <VEL>mode));
- tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
-
- emit_insn
- (gen_pred_gather<mode>_scalar
- (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
- operands[4], operands[5], operands[6], operands[7]));
- }
- DONE;
- }
- [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vmv.s.x. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
+ [(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfh"
- [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
+(define_insn_and_rewrite "pred_broadcast<mode>_zvfh"
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1, Wc1, Wb1, Wb1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
+ [(match_operand:<VM> 1 "vector_broadcast_mask_operand" "Wc1,Wc1,Wb1,Wb1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(vec_duplicate:V_VLSF
- (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
- (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:<VEL> 3 "direct_broadcast_operand" " f, f, f, f"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"@
vfmv.v.f\t%0,%3
vfmv.v.f\t%0,%3
vfmv.s.f\t%0,%3
vfmv.s.f\t%0,%3"
+ "&& (operands[1] == CONSTM1_RTX (<VM>mode)
+ && operands[4] == CONST1_RTX (Pmode)
+ && (register_operand (operands[3], <VEL>mode)
+ || satisfies_constraint_J (operands[3])))"
+{
+ /* A broadcast of a single element is just a vfmv.s.f. */
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+}
[(set_attr "type" "vfmov,vfmov,vfmovfv,vfmovfv")
(set_attr "mode" "<MODE>")])
-(define_insn "*pred_broadcast<mode>_zvfhmin"
- [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
- (if_then_else:V_VLSF_ZVFHMIN
- (unspec:<VM>
- [(match_operand:<VM> 1 "vector_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
- (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
- (match_operand 5 "const_int_operand" " i, i, i, i")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
- (reg:SI VL_REGNUM)
- (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (vec_duplicate:V_VLSF_ZVFHMIN
- (match_operand:<VEL> 3 "direct_broadcast_operand" " A, A, A, A"))
- (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
- "TARGET_VECTOR && strided_load_broadcast_p ()"
- "@
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero,%1.t
- vlse<sew>.v\t%0,%3,zero
- vlse<sew>.v\t%0,%3,zero"
- [(set_attr "type" "vlds,vlds,vlds,vlds")
- (set_attr "mode" "<MODE>")])
-
(define_insn "*pred_broadcast<mode>_extended_scalar"
[(set (match_operand:V_VLSI_D 0 "register_operand" "=vr, vr, vr, vr")
(if_then_else:V_VLSI_D
@@ -2398,6 +2327,117 @@
[(set_attr "type" "vimov,vimov")
(set_attr "mode" "<MODE>")])
+(define_expand "@pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLS 0 "register_operand")
+ (if_then_else:V_VLS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand")
+ (match_operand 4 "vector_length_operand")
+ (match_operand 5 "const_int_operand")
+ (match_operand 6 "const_int_operand")
+ (match_operand 7 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLS
+ (match_operand:<VEL> 3 "strided_broadcast_operand"))
+ (match_operand:V_VLS 2 "vector_merge_operand")))]
+ "TARGET_VECTOR"
+{
+ if (satisfies_constraint_Wb1 (operands[1]))
+ {
+ /* If we're asked to set a single element (like vmv.s.x but we
+	 need to go via memory here) and the tail policy is agnostic,
+	 we can overwrite all elements.
+ Thus, set the mask to broadcast. */
+ operands[1] = CONSTM1_RTX (<VM>mode);
+ if (!satisfies_constraint_vu (operands[2])
+ && GET_MODE_SIZE (<VEL>mode) > UNITS_PER_WORD)
+ {
+ /* Case 2: vmv.s.x (TU, x == memory) ==>
+ vl = 0 or 1; + vlse.v (TU) in RV32 system */
+ /* In this case we must not overwrite the residual elements,
+ so set the vector length to 0/1. */
+ operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
+ }
+ }
+})
+
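A strided broadcast replicates a single scalar loaded from memory by using a stride of zero; its per-element effect is roughly the following (illustrative C sketch, assuming all elements are active):

/* Effect of vlse<sew>.v vd, (rs1), zero: the same memory element is
   read for every destination element.  */
void
strided_broadcast (long long *dst, const long long *src, int vl)
{
  for (int i = 0; i < vl; i++)
    dst[i] = *src;
}
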
+(define_insn_and_split "*pred_strided_broadcast<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm,Wc1,Wc1")
+ (match_operand 4 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSI 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p () && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ rtx tmp = gen_reg_rtx (V1DImode);
+ emit_move_insn (tmp, gen_lowpart (V1DImode, operands[3]));
+ tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+ emit_insn
+ (gen_pred_gather<mode>_scalar
+ (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+ operands[4], operands[5], operands[6], operands[7]));
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
+
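When strided-load broadcasts are undesirable, the split above instead reinterprets the scalar as a one-element vector and broadcasts element 0 with a gather; roughly (illustrative C sketch):

/* Effect of vrgather.vx vd, vs2, zero: every destination element is a
   copy of element 0 of the source vector.  */
void
gather_broadcast (long long *dst, const long long *vec, int vl)
{
  for (int i = 0; i < vl; i++)
    dst[i] = vec[0];
}
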
+(define_insn_and_split "*pred_strided_broadcast<mode>_zvfhmin"
+ [(set (match_operand:V_VLSF_ZVFHMIN 0 "register_operand" "=vr, vr, vr, vr")
+ (if_then_else:V_VLSF_ZVFHMIN
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "strided_broadcast_mask_operand" " vm, vm, Wc1, Wc1")
+ (match_operand 4 "vector_length_operand" "rvl, rvl, rvl, rvl")
+ (match_operand 5 "const_int_operand" " i, i, i, i")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (vec_duplicate:V_VLSF_ZVFHMIN
+ (match_operand:<VEL> 3 "strided_broadcast_operand" " A, A, A, A"))
+ (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ "TARGET_VECTOR"
+ "@
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero,%1.t
+ vlse<sew>.v\t%0,%3,zero
+ vlse<sew>.v\t%0,%3,zero"
+ "&& !strided_load_broadcast_p ()
+ && <VEL>mode == HFmode
+ && can_create_pseudo_p ()"
+ [(const_int 0)]
+ {
+ poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+ machine_mode vmodehi
+ = riscv_vector::get_vector_mode (HImode, nunits).require ();
+ rtx ops[] = {gen_lowpart (vmodehi, operands[0]),
+ gen_lowpart (HImode, operands[3])};
+ riscv_vector::emit_avltype_insn (code_for_pred_broadcast (vmodehi),
+ riscv_vector::UNARY_OP, ops,
+ (riscv_vector::avl_type) INTVAL (operands[7]),
+ operands[4]);
+ DONE;
+ }
+ [(set_attr "type" "vlds,vlds,vlds,vlds")
+ (set_attr "mode" "<MODE>")])
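Without Zvfh there is no direct way to move a _Float16 scalar into a vector register, so the split above broadcasts its bit pattern through the corresponding HImode vector mode and reinterprets the result. The reinterpretation amounts to (illustrative C sketch, not part of the patch):

/* View a _Float16 as its 16-bit integer bit pattern so it can be
   broadcast via the HImode broadcast pattern and the vector viewed
   back as a half-float vector.  */
static inline unsigned short
hf_bits (_Float16 x)
{
  unsigned short bits;
  __builtin_memcpy (&bits, &x, sizeof bits);
  return bits;
}
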
+
+
;; -------------------------------------------------------------------------------
;; ---- Predicated Strided loads/stores
;; -------------------------------------------------------------------------------