aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLehua Ding <lehua.ding@rivai.ai>2023-08-14 11:34:13 +0800
committerLehua Ding <lehua.ding@rivai.ai>2023-08-18 10:31:13 +0800
commit86d80395cf3c8832b669135b1ca7ea8258790c19 (patch)
tree2345cf8948bcf3a5871deb17e45a4380d1c6f929
parentc43916857c6586e65f10713fdc5a65909918a8cc (diff)
downloadgcc-86d80395cf3c8832b669135b1ca7ea8258790c19.zip
gcc-86d80395cf3c8832b669135b1ca7ea8258790c19.tar.gz
gcc-86d80395cf3c8832b669135b1ca7ea8258790c19.tar.bz2
RISC-V: Revert the convert from vmv.s.x to vmv.v.i
Hi, This patch revert the convert from vmv.s.x to vmv.v.i and add new pattern optimize the special case when the scalar operand is zero. Currently, the broadcast pattern where the scalar operand is a imm will be converted to vmv.v.i from vmv.s.x and the mask operand will be converted from 00..01 to 11..11. There are some advantages and disadvantages before and after the conversion after discussing with Juzhe offline and we chose not to do this transform. Before: Advantages: The vsetvli info required by vmv.s.x has better compatibility since vmv.s.x only required SEW and VLEN be zero or one. That mean there is more opportunities to combine with other vsetlv infos in vsetvl pass. Disadvantages: For non-zero scalar imm, one more `li rd, imm` instruction will be needed. After: Advantages: No need `li rd, imm` instruction since vmv.v.i support imm operand. Disadvantages: Like before's advantages. Worse compatibility leads to more vsetvl instrunctions need. Consider the bellow C code and asm after autovec. there is an extra insn (vsetivli zero, 1, e32, m1, ta, ma) after converted vmv.s.x to vmv.v.i. ``` int foo1(int* restrict a, int* restrict b, int *restrict c, int n) { int sum = 0; for (int i = 0; i < n; i++) sum += a[i] * b[i]; return sum; } ``` asm (Before): ``` foo1: ble a3,zero,.L7 vsetvli a2,zero,e32,m1,ta,ma vmv.v.i v1,0 .L6: vsetvli a5,a3,e32,m1,tu,ma slli a4,a5,2 sub a3,a3,a5 vle32.v v2,0(a0) vle32.v v3,0(a1) add a0,a0,a4 add a1,a1,a4 vmacc.vv v1,v3,v2 bne a3,zero,.L6 vsetvli a2,zero,e32,m1,ta,ma vmv.s.x v2,zero vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret .L7: li a0,0 ret ``` asm (After): ``` foo1: ble a3,zero,.L4 vsetvli a2,zero,e32,m1,ta,ma vmv.v.i v1,0 .L3: vsetvli a5,a3,e32,m1,tu,ma slli a4,a5,2 sub a3,a3,a5 vle32.v v2,0(a0) vle32.v v3,0(a1) add a0,a0,a4 add a1,a1,a4 vmacc.vv v1,v3,v2 bne a3,zero,.L3 vsetivli zero,1,e32,m1,ta,ma vmv.v.i v2,0 vsetvli a2,zero,e32,m1,ta,ma vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret .L4: li a0,0 ret ``` Best, Lehua Co-Authored-By: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> gcc/ChangeLog: * config/riscv/predicates.md (vector_const_0_operand): New. * config/riscv/vector.md (*pred_broadcast<mode>_zero): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/scalar_move-5.c: Update. * gcc.target/riscv/rvv/base/scalar_move-6.c: Ditto.
-rw-r--r--gcc/config/riscv/predicates.md4
-rw-r--r--gcc/config/riscv/vector.md43
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c20
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c22
4 files changed, 70 insertions, 19 deletions
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index b6ebdcf..618ad60 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -292,6 +292,10 @@
(ior (match_operand 0 "register_operand")
(match_operand 0 "const_int_operand")))
+(define_predicate "vector_const_0_operand"
+ (and (match_code "const_vector")
+ (match_test "satisfies_constraint_Wc0 (op)")))
+
(define_predicate "vector_move_operand"
(ior (match_operand 0 "nonimmediate_operand")
(and (match_code "const_vector")
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index ff84f3f..e772e79 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1728,23 +1728,24 @@
(match_operand:V_VLS 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
- /* Handle vmv.s.x instruction which has memory scalar. */
- if (satisfies_constraint_Wdm (operands[3]) || riscv_vector::simm5_p (operands[3])
- || rtx_equal_p (operands[3], CONST0_RTX (<VEL>mode)))
+ /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */
+ if (satisfies_constraint_Wdm (operands[3]))
{
if (satisfies_constraint_Wb1 (operands[1]))
- {
- // Case 1: vmv.s.x (TA) ==> vlse.v (TA)
- if (satisfies_constraint_vu (operands[2]))
- operands[1] = CONSTM1_RTX (<VM>mode);
- else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
- {
- // Case 2: vmv.s.x (TU) ==> andi vl + vlse.v (TU) in RV32 system.
+ {
+ /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA) */
+ if (satisfies_constraint_vu (operands[2]))
+ operands[1] = CONSTM1_RTX (<VM>mode);
+ else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
+ {
+ /* Case 2: vmv.s.x (TU, x == memory) ==>
+ vl = 0 or 1; + vlse.v (TU) in RV32 system */
operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
operands[1] = CONSTM1_RTX (<VM>mode);
}
- else
- operands[3] = force_reg (<VEL>mode, operands[3]);
+ else
+ /* Case 3: load x (memory) to register. */
+ operands[3] = force_reg (<VEL>mode, operands[3]);
}
}
else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
@@ -1874,6 +1875,24 @@
[(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
(set_attr "mode" "<MODE>")])
+(define_insn "*pred_broadcast<mode>_zero"
+ [(set (match_operand:V_VLS 0 "register_operand" "=vr, vr")
+ (if_then_else:V_VLS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_least_significant_set_mask_operand" "Wb1, Wb1")
+ (match_operand 4 "vector_length_operand" " rK, rK")
+ (match_operand 5 "const_int_operand" " i, i")
+ (match_operand 6 "const_int_operand" " i, i")
+ (match_operand 7 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (match_operand:V_VLS 3 "vector_const_0_operand" "Wc0, Wc0")
+ (match_operand:V_VLS 2 "vector_merge_operand" " vu, 0")))]
+ "TARGET_VECTOR"
+ "vmv.s.x\t%0,zero"
+ [(set_attr "type" "vimovxv,vimovxv")
+ (set_attr "mode" "<MODE>")])
+
;; -------------------------------------------------------------------------------
;; ---- Predicated Strided loads/stores
;; -------------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
index db6800c..2e897a4 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
@@ -121,7 +121,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
/*
** foo9:
** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
** ...
** ret
*/
@@ -150,7 +150,7 @@ void foo10 (void *base, void *out, size_t vl)
/*
** foo11:
** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
** ...
** ret
*/
@@ -164,7 +164,7 @@ void foo11 (void *base, void *out, size_t vl)
/*
** foo12:
** ...
-** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** vmv.s.x\tv[0-9]+,\s*zero
** ...
** ret
*/
@@ -174,3 +174,17 @@ void foo12 (void *base, void *out, size_t vl)
vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0, vl);
*(vfloat64m2_t*)out = v;
}
+
+/*
+** foo13:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo13 (void *base, void *out, size_t vl)
+{
+ vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+ vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+ *(vfloat64m2_t*)out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
index f27f85c..326cfd8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
@@ -119,7 +119,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
/*
** foo9:
** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
** ...
** ret
*/
@@ -133,7 +133,7 @@ void foo9 (void *base, void *out, size_t vl)
/*
** foo10:
** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
** ...
*/
void foo10 (void *base, void *out, size_t vl)
@@ -147,7 +147,7 @@ void foo10 (void *base, void *out, size_t vl)
/*
** foo11:
** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
** ...
** ret
*/
@@ -161,7 +161,7 @@ void foo11 (void *base, void *out, size_t vl)
/*
** foo12:
** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
** ...
** ret
*/
@@ -173,6 +173,20 @@ void foo12 (void *base, void *out, size_t vl)
}
/*
+** foo12_1:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo12_1 (void *base, void *out, size_t vl)
+{
+ vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+ vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+ *(vfloat64m2_t*)out = v;
+}
+
+/*
** foo13:
** ...
** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+