about summary refs log tree commit diff
path: root/gcc
diff options
context:
space:
mode:
authorJu-Zhe Zhong <juzhe.zhong@rivai.ai>2023-05-25 22:42:35 +0800
committerPan Li <pan2.li@intel.com>2023-05-25 22:48:22 +0800
commitf574e2dfae79055f16d0c63cc12df24815d8ead6 (patch)
treee12b557836a13a5756eaf96052bfdfd119fc8f92 /gcc
parent560bb845321f5ad039a318a081b0e88d9900f5cb (diff)
downloadgcc-f574e2dfae79055f16d0c63cc12df24815d8ead6.zip
gcc-f574e2dfae79055f16d0c63cc12df24815d8ead6.tar.gz
gcc-f574e2dfae79055f16d0c63cc12df24815d8ead6.tar.bz2
VECT: Add decrement IV iteration loop control by variable amount support
This patch is supporting decrement IV by following the flow designed by Richard:

(1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
    call vect_set_loop_controls_directly.

(2) vect_set_loop_controls_directly calculates "step" as in your patch.
    If rgc has 1 control, this step is the SSA name created for that control.
    Otherwise the step is a fresh SSA name, as in your patch.

(3) vect_set_loop_controls_directly stores this step somewhere for later use,
    probably in LOOP_VINFO. Let's use "S" to refer to this stored step.

(4) After the vect_set_loop_controls_directly call above, and outside the "if"
    statement that now contains vect_set_loop_controls_directly, check whether
    rgc->controls.length () > 1. If so, use vect_adjust_loop_lens_control to
    set the controls based on S.

Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors. And the starting step for
vect_adjust_loop_lens_control is always S.

This patch has well tested for single-rgroup and multiple-rgroup (SLP) and
passed all testcase in RISC-V port.

Signed-off-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Co-Authored-By: Richard Sandiford <richard.sandiford@arm.com>

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
	(vect_set_loop_controls_directly): Add decrement IV support.
	(vect_set_loop_condition_partial_vectors): Ditto.
	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): New variable.
	* tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c: New test.
	* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c: New test.
	* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c: New test.
	* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c288
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c75
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c36
-rw-r--r--gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c15
-rw-r--r--gcc/tree-vect-loop-manip.cc136
-rw-r--r--gcc/tree-vect-loop.cc12
-rw-r--r--gcc/tree-vectorizer.h8
7 files changed, 558 insertions, 12 deletions
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c
new file mode 100644
index 0000000..9579749
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c
@@ -0,0 +1,288 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax" } */
+
+#include <stdint-gcc.h>
+
+void __attribute__ ((noinline, noclone))
+f0 (int8_t *__restrict x, int16_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 4, j += 8)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ x[i + 2] += 3;
+ x[i + 3] += 4;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ y[j + 4] += 5;
+ y[j + 5] += 6;
+ y[j + 6] += 7;
+ y[j + 7] += 8;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f0_init (int8_t *__restrict x, int8_t *__restrict x2, int16_t *__restrict y,
+ int16_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 4, j += 8)
+ {
+ x[i + 0] = i % 120;
+ x[i + 1] = i % 78;
+ x[i + 2] = i % 55;
+ x[i + 3] = i % 27;
+ y[j + 0] = j % 33;
+ y[j + 1] = j % 44;
+ y[j + 2] = j % 66;
+ y[j + 3] = j % 88;
+ y[j + 4] = j % 99;
+ y[j + 5] = j % 39;
+ y[j + 6] = j % 49;
+ y[j + 7] = j % 101;
+
+ x2[i + 0] = i % 120;
+ x2[i + 1] = i % 78;
+ x2[i + 2] = i % 55;
+ x2[i + 3] = i % 27;
+ y2[j + 0] = j % 33;
+ y2[j + 1] = j % 44;
+ y2[j + 2] = j % 66;
+ y2[j + 3] = j % 88;
+ y2[j + 4] = j % 99;
+ y2[j + 5] = j % 39;
+ y2[j + 6] = j % 49;
+ y2[j + 7] = j % 101;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f0_golden (int8_t *__restrict x, int16_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 4, j += 8)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ x[i + 2] += 3;
+ x[i + 3] += 4;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ y[j + 4] += 5;
+ y[j + 5] += 6;
+ y[j + 6] += 7;
+ y[j + 7] += 8;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f0_check (int8_t *__restrict x, int8_t *__restrict x2, int16_t *__restrict y,
+ int16_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 4, j += 8)
+ {
+ if (x[i + 0] != x2[i + 0])
+ __builtin_abort ();
+ if (x[i + 1] != x2[i + 1])
+ __builtin_abort ();
+ if (x[i + 2] != x2[i + 2])
+ __builtin_abort ();
+ if (x[i + 3] != x2[i + 3])
+ __builtin_abort ();
+ if (y[j + 0] != y2[j + 0])
+ __builtin_abort ();
+ if (y[j + 1] != y2[j + 1])
+ __builtin_abort ();
+ if (y[j + 2] != y2[j + 2])
+ __builtin_abort ();
+ if (y[j + 3] != y2[j + 3])
+ __builtin_abort ();
+ if (y[j + 4] != y2[j + 4])
+ __builtin_abort ();
+ if (y[j + 5] != y2[j + 5])
+ __builtin_abort ();
+ if (y[j + 6] != y2[j + 6])
+ __builtin_abort ();
+ if (y[j + 7] != y2[j + 7])
+ __builtin_abort ();
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f1 (int16_t *__restrict x, int32_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f1_init (int16_t *__restrict x, int16_t *__restrict x2, int32_t *__restrict y,
+ int32_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] = i % 67;
+ x[i + 1] = i % 76;
+ y[j + 0] = j % 111;
+ y[j + 1] = j % 63;
+ y[j + 2] = j % 39;
+ y[j + 3] = j % 8;
+
+ x2[i + 0] = i % 67;
+ x2[i + 1] = i % 76;
+ y2[j + 0] = j % 111;
+ y2[j + 1] = j % 63;
+ y2[j + 2] = j % 39;
+ y2[j + 3] = j % 8;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f1_golden (int16_t *__restrict x, int32_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f1_check (int16_t *__restrict x, int16_t *__restrict x2, int32_t *__restrict y,
+ int32_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ if (x[i + 0] != x2[i + 0])
+ __builtin_abort ();
+ if (x[i + 1] != x2[i + 1])
+ __builtin_abort ();
+ if (y[j + 0] != y2[j + 0])
+ __builtin_abort ();
+ if (y[j + 1] != y2[j + 1])
+ __builtin_abort ();
+ if (y[j + 2] != y2[j + 2])
+ __builtin_abort ();
+ if (y[j + 3] != y2[j + 3])
+ __builtin_abort ();
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f2 (int32_t *__restrict x, int64_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] += 1;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f2_init (int32_t *__restrict x, int32_t *__restrict x2, int64_t *__restrict y,
+ int64_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] = i % 79;
+ y[j + 0] = j % 83;
+ y[j + 1] = j % 100;
+
+ x2[i + 0] = i % 79;
+ y2[j + 0] = j % 83;
+ y2[j + 1] = j % 100;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f2_golden (int32_t *__restrict x, int64_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] += 1;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f2_check (int32_t *__restrict x, int32_t *__restrict x2, int64_t *__restrict y,
+ int64_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ if (x[i + 0] != x2[i + 0])
+ __builtin_abort ();
+ if (y[j + 0] != y2[j + 0])
+ __builtin_abort ();
+ if (y[j + 1] != y2[j + 1])
+ __builtin_abort ();
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f3 (int8_t *__restrict x, int64_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] += 1;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f3_init (int8_t *__restrict x, int8_t *__restrict x2, int64_t *__restrict y,
+ int64_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] = i % 22;
+ y[j + 0] = i % 12;
+ y[j + 1] = i % 21;
+
+ x2[i + 0] = i % 22;
+ y2[j + 0] = i % 12;
+ y2[j + 1] = i % 21;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f3_golden (int8_t *__restrict x, int64_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ x[i + 0] += 1;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ }
+}
+
+void __attribute__ ((noinline, noclone))
+f3_check (int8_t *__restrict x, int8_t *__restrict x2, int64_t *__restrict y,
+ int64_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 1, j += 2)
+ {
+ if (x[i + 0] != x2[i + 0])
+ __builtin_abort ();
+ if (y[j + 0] != y2[j + 0])
+ __builtin_abort ();
+ if (y[j + 1] != y2[j + 1])
+ __builtin_abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c
new file mode 100644
index 0000000..e87961e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax" } */
+
+#include <stdint-gcc.h>
+
+void __attribute__ ((noinline, noclone))
+f (uint64_t *__restrict x, uint16_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f_init (uint64_t *__restrict x, uint64_t *__restrict x2, uint16_t *__restrict y,
+ uint16_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] = i * 897 + 189;
+ x[i + 1] = i * 79 + 55963;
+ y[j + 0] = j * 18 + 78642;
+ y[j + 1] = j * 9 + 8634;
+ y[j + 2] = j * 78 + 2588;
+ y[j + 3] = j * 38 + 8932;
+
+ x2[i + 0] = i * 897 + 189;
+ x2[i + 1] = i * 79 + 55963;
+ y2[j + 0] = j * 18 + 78642;
+ y2[j + 1] = j * 9 + 8634;
+ y2[j + 2] = j * 78 + 2588;
+ y2[j + 3] = j * 38 + 8932;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f_golden (uint64_t *__restrict x, uint16_t *__restrict y, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ x[i + 0] += 1;
+ x[i + 1] += 2;
+ y[j + 0] += 1;
+ y[j + 1] += 2;
+ y[j + 2] += 3;
+ y[j + 3] += 4;
+ }
+}
+
+void __attribute__ ((optimize (0)))
+f_check (uint64_t *__restrict x, uint64_t *__restrict x2,
+ uint16_t *__restrict y, uint16_t *__restrict y2, int n)
+{
+ for (int i = 0, j = 0; i < n; i += 2, j += 4)
+ {
+ if (x[i + 0] != x2[i + 0])
+ __builtin_abort ();
+ if (x[i + 1] != x2[i + 1])
+ __builtin_abort ();
+ if (y[j + 0] != y2[j + 0])
+ __builtin_abort ();
+ if (y[j + 1] != y2[j + 1])
+ __builtin_abort ();
+ if (y[j + 2] != y2[j + 2])
+ __builtin_abort ();
+ if (y[j + 3] != y2[j + 3])
+ __builtin_abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c
new file mode 100644
index 0000000..b786738
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-3.c"
+
+int __attribute__ ((optimize (0))) main (void)
+{
+ int8_t f0_x[3108], f0_x2[3108];
+ int16_t f0_y[6216], f0_y2[6216];
+ f0_init (f0_x, f0_x2, f0_y, f0_y2, 3108);
+ f0 (f0_x, f0_y, 3108);
+ f0_golden (f0_x2, f0_y2, 3108);
+ f0_check (f0_x, f0_x2, f0_y, f0_y2, 3108);
+
+ int16_t f1_x[1998], f1_x2[1998];
+ int32_t f1_y[3996], f1_y2[3996];
+ f1_init (f1_x, f1_x2, f1_y, f1_y2, 1998);
+ f1 (f1_x, f1_y, 1998);
+ f1_golden (f1_x2, f1_y2, 1998);
+ f1_check (f1_x, f1_x2, f1_y, f1_y2, 1998);
+
+ int32_t f2_x[2023], f2_x2[2023];
+ int64_t f2_y[4046], f2_y2[4046];
+ f2_init (f2_x, f2_x2, f2_y, f2_y2, 2023);
+ f2 (f2_x, f2_y, 2023);
+ f2_golden (f2_x2, f2_y2, 2023);
+ f2_check (f2_x, f2_x2, f2_y, f2_y2, 2023);
+
+ int8_t f3_x[3203], f3_x2[3203];
+ int64_t f3_y[6406], f3_y2[6406];
+ f3_init (f3_x, f3_x2, f3_y, f3_y2, 3203);
+ f3 (f3_x, f3_y, 3203);
+ f3_golden (f3_x2, f3_y2, 3203);
+ f3_check (f3_x, f3_x2, f3_y, f3_y2, 3203);
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c
new file mode 100644
index 0000000..7751384
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c
@@ -0,0 +1,15 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-4.c"
+
+int __attribute__ ((optimize (0))) main (void)
+{
+ uint64_t f_x[3108], f_x2[3108];
+ uint16_t f_y[6216], f_y2[6216];
+ f_init (f_x, f_x2, f_y, f_y2, 3108);
+ f (f_x, f_y, 3108);
+ f_golden (f_x2, f_y2, 3108);
+ f_check (f_x, f_x2, f_y, f_y2, 3108);
+ return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e..acf3642 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,63 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
return false;
}
+/* Populate DEST_RGM->controls, given that they should add up to STEP.
+
+ STEP = MIN_EXPR <ivtmp_34, VF>;
+
+ First length (MIN (X, VF/N)):
+ loop_len_15 = MIN_EXPR <STEP, VF/N>;
+
+ Second length:
+ tmp = STEP - loop_len_15;
+ loop_len_16 = MIN (tmp, VF/N);
+
+ Third length:
+ tmp2 = tmp - loop_len_16;
+ loop_len_17 = MIN (tmp2, VF/N);
+
+ Last length:
+ loop_len_18 = tmp2 - loop_len_17;
+*/
+
+static void
+vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
+ rgroup_controls *dest_rgm, tree step)
+{
+ tree ctrl_type = dest_rgm->type;
+ poly_uint64 nitems_per_ctrl
+ = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+ tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+
+ for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+ {
+ tree ctrl = dest_rgm->controls[i];
+ if (i == 0)
+ {
+ /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
+ gassign *assign
+ = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+ gimple_seq_add_stmt (seq, assign);
+ }
+ else if (i == dest_rgm->controls.length () - 1)
+ {
+ /* Last iteration: Remain capped to the range [0, VF/N]. */
+ gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
+ dest_rgm->controls[i - 1]);
+ gimple_seq_add_stmt (seq, assign);
+ }
+ else
+ {
+ /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
+ step = gimple_build (seq, MINUS_EXPR, iv_type, step,
+ dest_rgm->controls[i - 1]);
+ gassign *assign
+ = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+ gimple_seq_add_stmt (seq, assign);
+ }
+ }
+}
+
/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
for all the rgroup controls in RGC and return a control that is nonzero
when the loop needs to iterate. Add any new preheader statements to
@@ -425,7 +482,8 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
gimple_seq *header_seq,
gimple_stmt_iterator loop_cond_gsi,
rgroup_controls *rgc, tree niters,
- tree niters_skip, bool might_wrap_p)
+ tree niters_skip, bool might_wrap_p,
+ tree *iv_step)
{
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
@@ -468,6 +526,39 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
gimple_stmt_iterator incr_gsi;
bool insert_after;
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+ {
+ /* Create an IV that counts down from niters_total and whose step
+ is the (variable) amount processed in the current iteration:
+ ...
+ _10 = (unsigned long) count_12(D);
+ ...
+ # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+ _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+ ...
+ vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+ ...
+ ivtmp_35 = ivtmp_9 - _36;
+ ...
+ if (ivtmp_35 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
+ */
+ nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+ tree step = rgc->controls.length () == 1 ? rgc->controls[0]
+ : make_ssa_name (iv_type);
+ /* Create decrement IV. */
+ create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr);
+ gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
+ index_before_incr,
+ nitems_step));
+ *iv_step = step;
+ return index_after_incr;
+ }
+
+ /* Create increment IV. */
create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
loop, &incr_gsi, insert_after, &index_before_incr,
&index_after_incr);
@@ -733,7 +824,9 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
the first control from any rgroup for the loop condition; here we
arbitrarily pick the last. */
tree test_ctrl = NULL_TREE;
+ tree iv_step = NULL_TREE;
rgroup_controls *rgc;
+ rgroup_controls *iv_rgc = nullptr;
unsigned int i;
auto_vec<rgroup_controls> *controls = use_masks_p
? &LOOP_VINFO_MASKS (loop_vinfo)
@@ -753,17 +846,36 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
continue;
}
- /* See whether zero-based IV would ever generate all-false masks
- or zero length before wrapping around. */
- bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
-
- /* Set up all controls for this group. */
- test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
- &preheader_seq,
- &header_seq,
- loop_cond_gsi, rgc,
- niters, niters_skip,
- might_wrap_p);
+ if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+ || !iv_rgc
+ || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
+ != rgc->max_nscalars_per_iter * rgc->factor))
+ {
+ /* See whether zero-based IV would ever generate all-false masks
+ or zero length before wrapping around. */
+ bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
+
+ /* Set up all controls for this group. */
+ test_ctrl
+ = vect_set_loop_controls_directly (loop, loop_vinfo,
+ &preheader_seq, &header_seq,
+ loop_cond_gsi, rgc, niters,
+ niters_skip, might_wrap_p,
+ &iv_step);
+
+ iv_rgc = rgc;
+ }
+
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+ && rgc->controls.length () > 1)
+ {
+ /* vect_set_loop_controls_directly creates an IV whose step
+ is equal to the expected sum of RGC->controls. Use that
+ information to populate RGC->controls. */
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ gcc_assert (iv_step);
+ vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
+ }
}
/* Emit all accumulated statements. */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index cf10132..5b7a0da 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorizable (false),
can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
using_partial_vectors_p (false),
+ using_decrementing_iv_p (false),
epil_using_partial_vectors_p (false),
partial_load_store_bias (0),
peeling_for_gaps (false),
@@ -2725,6 +2726,17 @@ start_over:
&& !vect_verify_loop_lens (loop_vinfo))
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ /* If we're vectorizing a loop that uses length "controls" and
+ can iterate more than once, we apply decrementing IV approach
+ in loop control. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+ && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
+ && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+ LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
/* If we're vectorizing an epilogue loop, the vectorized loop either needs
to be able to handle fewer than VF scalars, or needs to have a lower VF
than the main loop. */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 02d2ad6..fba09b9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@ public:
the vector loop can handle fewer than VF scalars. */
bool using_partial_vectors_p;
+ /* True if we've decided to use a decrementing loop control IV that counts
+ scalars. This can be done for any loop that:
+
+ (a) uses length "controls"; and
+ (b) can iterate more than once. */
+ bool using_decrementing_iv_p;
+
/* True if we've decided to use partially-populated vectors for the
epilogue of loop. */
bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@ public:
#define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
(L)->epil_using_partial_vectors_p
#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias