diff options
author | Jan Hubicka <jh@suse.cz> | 2023-08-06 21:23:31 +0200 |
---|---|---|
committer | Jan Hubicka <jh@suse.cz> | 2023-08-06 21:23:31 +0200 |
commit | 838237aeeba578fc2cf42bfd3ecb9d9a4fb7a2b4 (patch) | |
tree | 2539c5a2f161c15c5d4f109dcdcdef65d0c70374 /gcc | |
parent | 3802297528685aa16c4f43bb1f0cedbdbf22923d (diff) | |
download | gcc-838237aeeba578fc2cf42bfd3ecb9d9a4fb7a2b4.zip gcc-838237aeeba578fc2cf42bfd3ecb9d9a4fb7a2b4.tar.gz gcc-838237aeeba578fc2cf42bfd3ecb9d9a4fb7a2b4.tar.bz2 |
Fix profile update after peeled epilogues
Epilogue peeling expects the scalar loop to have the same number of executions as
the vector loop, which is true at the beginning of vectorization. However, if the
epilogues are vectorized, this is no longer the case. In this situation the
loop preheader is replaced by new guard code with a correct profile, but the
loop body is left unscaled. This leads to a loop that exits more often than
it is entered.
This patch adds logic to scale the frequencies down and also to fix the profile
of the original preheader where necessary.
Bootstrapped/regtested x86_64-linux, committed.
gcc/ChangeLog:
* tree-vect-loop-manip.cc (vect_do_peeling): Fix profile update of peeled epilogues.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-bitfield-read-1.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-2.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-3.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-4.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-5.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-6.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-read-7.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-1.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-2.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-3.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-4.c: Check profile consistency.
* gcc.dg/vect/vect-bitfield-write-5.c: Check profile consistency.
* gcc.dg/vect/vect-epilogues-2.c: Check profile consistency.
* gcc.dg/vect/vect-epilogues.c: Check profile consistency.
* gcc.dg/vect/vect-mask-store-move-1.c: Check profile consistency.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-epilogues.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c | 2 | ||||
-rw-r--r-- | gcc/tree-vect-loop-manip.cc | 13 |
16 files changed, 41 insertions, 2 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c index 42e50d9..147c959 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -39,3 +40,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c index a9aeefc..982e6a7 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_shift } */ /* { dg-require-effective-target vect_long_long } */ @@ -42,3 +43,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c index c7d0fd2..f2a43c3 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -43,3 +44,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c index 6a3ed8c..9f6f022 100644 
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_shift } */ /* { dg-require-effective-target vect_long_long } */ @@ -44,3 +45,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c index b2889df..662aed1 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -41,3 +42,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c index 2445f53..9b315d6 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -41,3 +42,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c index 4b1ec8a..6d1043d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c @@ -1,3 +1,4 @@ +/* { 
dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ @@ -42,3 +43,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c index 22e6235..7c710cf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include <stdarg.h> @@ -38,3 +39,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c index 0c8291c..3b609183 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_long_long } */ @@ -42,3 +43,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c index 46fcb02..e96da82 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_long_long } */ 
@@ -43,3 +44,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c index 5a7227a..6644221 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include <stdarg.h> @@ -41,3 +42,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c index e0b36e4..386de50 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-require-effective-target vect_int } */ #include <stdarg.h> @@ -41,3 +42,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c index b251e1f..63c5e23 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ int @@ -55,3 +56,4 @@ f6 (int *x, int a) x[a] += 1; return res; } +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c index ab7e8a1..11b8c83 100644 --- 
a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c +++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ /* Copied from PR 88915. */ @@ -17,3 +18,4 @@ void pixel_avg( unsigned char *dst, int i_dst_stride, } /* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { target vect_multiple_sizes xfail { { arm32 && be } || vect_partial_vectors_usage_2 } } } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c index 1e06b58..700adf9 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c @@ -1,3 +1,4 @@ +/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */ /* { dg-do compile } */ /* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */ @@ -16,3 +17,4 @@ void foo (int n) } /* { dg-final { scan-tree-dump-times "Move stmt to created bb" 4 "vect" { target { i?86-*-* x86_64-*-* } xfail { i?86-*-* x86_64-*-* } } } } */ +/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */ diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 9de897d..0e7e223 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -3271,6 +3271,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec_debug_stmts (); scev_reset (); } + basic_block bb_before_epilog = NULL; if (epilog_peeling) { @@ -3290,6 +3291,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, epilog->force_vectorize = false; slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); + bb_before_epilog = loop_preheader_edge (epilog)->src; /* Scalar version loop may be preferred. In this case, add guard and skip to epilog. 
Note this only happens when the number of @@ -3317,6 +3319,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, /* Simply propagate profile info from guard_bb to guard_to which is a merge point of control flow. */ + profile_count old_count = guard_to->count; guard_to->count = guard_bb->count; /* Restore the counts of the epilog loop if we didn't use the scalar loop. */ @@ -3332,9 +3335,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, free (bbs); free (original_bbs); } - } + else + scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1); - basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; + /* Only need to handle basic block before epilog loop if it's not + the guard_bb, which is the case when skip_vector is true. */ + if (guard_bb != bb_before_epilog) + bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count (); + bb_before_epilog = loop_preheader_edge (epilog)->src; + } /* If loop is peeled for non-zero constant times, now niters refers to orig_niters - prolog_peeling, it won't overflow even the orig_niters overflows. */ |