diff options
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 | ||||
-rw-r--r-- | gcc/tree-vect-loop.cc | 21 |
2 files changed, 58 insertions, 3 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c new file mode 100644 index 0000000..bb3b6dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */ +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */ + +/* Calculate the vectorized induction with smaller step for an unrolled loop. + + before (suggested_unroll_factor=2): + fmov s30, 8.0e+0 + fmov s31, 4.0e+0 + dup v27.4s, v30.s[0] + dup v28.4s, v31.s[0] + .L6: + mov v30.16b, v31.16b + fadd v31.4s, v31.4s, v27.4s + fadd v29.4s, v30.4s, v28.4s + stp q30, q29, [x0] + add x0, x0, 32 + cmp x1, x0 + bne .L6 + + after: + fmov s31, 4.0e+0 + dup v29.4s, v31.s[0] + .L6: + fadd v30.4s, v31.4s, v29.4s + stp q31, q30, [x0] + add x0, x0, 32 + fadd v31.4s, v29.4s, v30.4s + cmp x0, x1 + bne .L6 */ + +void +foo2 (float *arr, float freq, float step) +{ + for (int i = 0; i < 1024; i++) + { + arr[i] = freq; + freq += step; + } +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 36d19a5..7d917bf 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10098,7 +10098,7 @@ vectorizable_induction (loop_vec_info loop_vinfo, new_vec, step_vectype, NULL); vec_def = induc_def; - for (i = 1; i < ncopies; i++) + for (i = 1; i < ncopies + 1; i++) { /* vec_i = vec_prev + vec_step */ gimple_seq stmts = NULL; @@ -10108,8 +10108,23 @@ vectorizable_induction (loop_vec_info loop_vinfo, vec_def = gimple_convert (&stmts, vectype, vec_def); gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); - new_stmt = SSA_NAME_DEF_STMT (vec_def); - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + if (i < ncopies) + { + new_stmt = SSA_NAME_DEF_STMT (vec_def); + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + } + else + { + /* vec_1 = vec_iv + (VF/n * S) + vec_2 = vec_1 + (VF/n * S) + ... 
+ vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop + + vec_n is used as vec_loop to save the large step register and + related operations. */ + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), + UNKNOWN_LOCATION); + } } } |