RISC-V: Enhance AVL propagation for complicated reduction auto-vectorization

I noticed that we failed to propagate AVL for a reduction in a more complicated situation:

double foo (double *__restrict a, double *__restrict b, double *__restrict c, int n)
{
  double result = 0;
  for (int i = 0; i < n; i++)
    result += a[i] * b[i] * c[i];
  return result;
}

  vsetvli a5,a3,e8,mf8,ta,ma    -> should be fused into e64m1,TU
  slli a4,a5,3
  vle64.v v3,0(a0)
  vle64.v v1,0(a1)
  vsetvli a6,zero,e64,m1,ta,ma  -> redundant
  vfmul.vv v1,v1,v3
  vsetvli zero,a5,e64,m1,tu,ma  -> redundant
  vle64.v v3,0(a2)
  vfmacc.vv v2,v1,v3
  add a0,a0,a4
  add a1,a1,a4
  add a2,a2,a4
  sub a3,a3,a5
  bne a3,zero,.L3

The failed AVL propagation causes redundant AVL/VL toggling.

The root cause is as follows:

  vsetvl a5, zero
  vadd.vv def r136
  vsetvl zero, a3, ... TU
  vsub.vv (use r136)

We propagate AVL (r136) from 'vsub.vv' into 'vadd.vv' only when 'vsub.vv' uses the TA policy. However, that is too restrictive, so we missed an optimization here.

We enhance AVL propagation for the TU policy in the following situation:

  vsetvl a5, zero
  vadd.vv def r136
  vsetvl zero, a3, ... TU
  vsub.vv (use r136, merge != r136)

Note that we should only propagate AVL when merge != r136, since in that case 'vsub.vv' doesn't depend on the tail elements.

After this patch:

  vsetvli a5,a3,e64,m1,tu,ma
  slli a4,a5,3
  vle64.v v3,0(a0)
  vle64.v v1,0(a1)
  vfmul.vv v1,v1,v3
  vle64.v v3,0(a2)
  vfmacc.vv v2,v3,v1
  add a0,a0,a4
  add a1,a1,a4
  add a2,a2,a4
  sub a3,a3,a5
  bne a3,zero,.L3

PR target/112399

gcc/ChangeLog:

  * config/riscv/riscv-avlprop.cc (pass_avlprop::get_vlmax_ta_preferred_avl): Enhance AVL propagation.
  * config/riscv/t-riscv: Add new include.

gcc/testsuite/ChangeLog:

  * gcc.target/riscv/rvv/vsetvl/imm_switch-2.c: Adapt test.
  * gcc.target/riscv/rvv/autovec/pr112399.c: New test.
author: Juzhe-Zhong <juzhe.zhong@rivai.ai> 2023-11-06 11:34:26 +0800
committer: Pan Li <pan2.li@intel.com> 2023-11-07 15:01:58 +0800
commit: f1e084c6c3ef1d1233e35823dacfdf9cee722430 (patch)
tree: 8910c220173b8208e7485f6c48fda3ca203cea66 /gcc
parent: fd56a9cc5e37bb51c725a83aa44de34b9c238e78 (diff)
download: gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.zip
gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.tar.gz
gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.tar.bz2
4 files changed, 49 insertions, 5 deletions
diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc
index 1dfaa87..1f6ba40 100644
--- a/gcc/config/riscv/riscv-avlprop.cc
+++ b/gcc/config/riscv/riscv-avlprop.cc
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl-ssa.h"
 #include "cfgcleanup.h"
 #include "insn-attr.h"
+#include "tm-constrs.h"
 
 using namespace rtl_ssa;
 using namespace riscv_vector;
@@ -285,8 +286,20 @@ pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const
 	  if (!use_insn->can_be_optimized () || use_insn->is_asm ()
 	      || use_insn->is_call () || use_insn->has_volatile_refs ()
 	      || use_insn->has_pre_post_modify ()
-	      || !has_vl_op (use_insn->rtl ())
-	      || !tail_agnostic_p (use_insn->rtl ()))
+	      || !has_vl_op (use_insn->rtl ()))
+	    return NULL_RTX;
+
+	  /* We should only propagate a non-VLMAX AVL into a VLMAX insn
+	     when the potential tail elements of that insn (after
+	     propagation) are not used.  So we must make sure the tail
+	     elements of the VLMAX insn's result are not depended on.  */
+	  extract_insn_cached (use_insn->rtl ());
+	  int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ());
+	  if (merge_op_idx != INVALID_ATTRIBUTE
+	      && !satisfies_constraint_vu (recog_data.operand[merge_op_idx])
+	      && refers_to_regno_p (set->regno (),
+				    recog_data.operand[merge_op_idx])
+	      && !tail_agnostic_p (use_insn->rtl ()))
 	    return NULL_RTX;
 
 	  int new_sew = get_sew (use_insn->rtl ());
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index f8ca3f4..95becfc 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -80,7 +80,8 @@ riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
 
 riscv-avlprop.o: $(srcdir)/config/riscv/riscv-avlprop.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(REGS_H) \
-  $(TARGET_H) tree-pass.h df.h rtl-ssa.h cfgcleanup.h insn-attr.h 
+  $(TARGET_H) tree-pass.h df.h rtl-ssa.h cfgcleanup.h insn-attr.h \
+  tm-constrs.h
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/riscv/riscv-avlprop.cc
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112399.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112399.c
new file mode 100644
index 0000000..948e12b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112399.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-vect-cost-model -ffast-math -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** foo:
+**   ...
+**   vsetvli\s*[a-x0-9]+,\s*[a-x0-9]+,\s*e64,\s*m1,\s*tu,\s*m[au]
+**   slli\s*[a-x0-9]+,\s*[a-x0-9]+,\s*3
+**   vle64\.v\s*v[0-9]+,\s*0\([a-x0-9]+\)
+**   vle64\.v\s*v[0-9]+,\s*0\([a-x0-9]+\)
+**   vfmul\.vv\s*v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   vle64\.v\s*\s*v[0-9]+,\s*0\([a-x0-9]+\)
+**   vfmacc\.vv\s*\s*v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   add\s*[a-x0-9]+,\s*[a-x0-9]+,\s*[a-x0-9]+
+**   add\s*[a-x0-9]+,\s*[a-x0-9]+,\s*[a-x0-9]+
+**   add\s*[a-x0-9]+,\s*[a-x0-9]+,\s*[a-x0-9]+
+**   sub\s*[a-x0-9]+,\s*[a-x0-9]+,\s*[a-x0-9]+
+**   ...
+*/
+
+double
+foo (double *__restrict a, double *__restrict b, double *__restrict c, int n)
+{
+  double result = 0;
+  for (int i = 0; i < n; i++)
+    result += a[i] * b[i] * c[i];
+  return result;
+}
+
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_switch-2.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_switch-2.c
index 2e58f08..c55faa5 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_switch-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_switch-2.c
@@ -23,6 +23,5 @@ void f (void * restrict in, void * restrict out, void * restrict mask_in, int n)
 /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*19,\s*e32,\s*mf2,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
 /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*19,\s*e16,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
 /* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
 /* { dg-final { scan-assembler-times {vsetivli} 4 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
author	Juzhe-Zhong <juzhe.zhong@rivai.ai>	2023-11-06 11:34:26 +0800
committer	Pan Li <pan2.li@intel.com>	2023-11-07 15:01:58 +0800
commit	f1e084c6c3ef1d1233e35823dacfdf9cee722430 (patch)
tree	8910c220173b8208e7485f6c48fda3ca203cea66 /gcc
parent	fd56a9cc5e37bb51c725a83aa44de34b9c238e78 (diff)
download	gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.zip gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.tar.gz gcc-f1e084c6c3ef1d1233e35823dacfdf9cee722430.tar.bz2