about summary refs log tree commit diff
path: root/gcc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr112325.c | 59
-rw-r--r--  gcc/tree-ssa-loop-ivcanon.cc | 49
2 files changed, 86 insertions(+), 22 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 0000000..71cf409
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+ unsigned short s;
+ __builtin_memcpy(&s, &f, sizeof(unsigned short));
+ return table_f32_f16[s];
+}
+
+typedef struct {
+ ggml_fp16_t d;
+ ggml_fp16_t m;
+ unsigned char qh[4];
+ unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+ float d;
+ float s;
+ char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ const int qk = 32;
+ const int nb = n / qk;
+
+ const block_q5_1 * restrict x = vx;
+ const block_q8_1 * restrict y = vy;
+
+ float sumf = 0.0;
+
+ for (int i = 0; i < nb; i++) {
+ unsigned qh;
+ __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+ int sumi = 0;
+
+ for (int j = 0; j < qk/2; ++j) {
+ const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+ const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+ const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+ const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+ sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+ }
+
+ sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf01713..5ef24a9 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
It is (NUNROLL + 1) * size of loop body with taking into account
the fact that in last copy everything after exit conditional
is dead and that some instructions will be eliminated after
- peeling.
-
- Loop body is likely going to simplify further, this is difficult
- to guess, we just decrease the result by 1/3. */
-
+ peeling. */
static unsigned HOST_WIDE_INT
estimated_unrolled_size (struct loop_size *size,
unsigned HOST_WIDE_INT nunroll)
@@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
unr_insns = 0;
unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
- unr_insns = unr_insns * 2 / 3;
- if (unr_insns <= 0)
- unr_insns = 1;
-
return unr_insns;
}
@@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop,
edge exit, tree niter, bool may_be_zero,
enum unroll_level ul,
HOST_WIDE_INT maxiter,
- dump_user_location_t locus, bool allow_peel)
+ dump_user_location_t locus, bool allow_peel,
+ bool cunrolli)
{
unsigned HOST_WIDE_INT n_unroll = 0;
bool n_unroll_found = false;
@@ -847,8 +840,9 @@ try_unroll_loop_completely (class loop *loop,
/* If the code is going to shrink, we don't need to be extra
cautious on guessing if the unrolling is going to be
- profitable. */
- if (unr_insns
+ profitable.
+ Move from estimated_unrolled_size to unroll small loops. */
+ if (unr_insns * 2 / 3
/* If there is IV variable that will become constant, we
save one instruction in the loop prologue we do not
account otherwise. */
@@ -919,7 +913,13 @@ try_unroll_loop_completely (class loop *loop,
loop->num);
return false;
}
- else if (unr_insns
+ /* Move 2 / 3 reduction from estimated_unrolled_size, but don't reduce
+ unrolled size for innermost loop.
+ 1) It could increase register pressure.
+ 2) Big loop after completely unroll may not be vectorized
+ by BB vectorizer. */
+ else if ((cunrolli && !loop->inner
+ ? unr_insns : unr_insns * 2 / 3)
> (unsigned) param_max_completely_peeled_insns)
{
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1227,7 +1227,7 @@ try_peel_loop (class loop *loop,
static bool
canonicalize_loop_induction_variables (class loop *loop,
bool create_iv, enum unroll_level ul,
- bool try_eval, bool allow_peel)
+ bool try_eval, bool allow_peel, bool cunrolli)
{
edge exit = NULL;
tree niter;
@@ -1314,7 +1314,7 @@ canonicalize_loop_induction_variables (class loop *loop,
dump_user_location_t locus = find_loop_location (loop);
if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
- maxiter, locus, allow_peel))
+ maxiter, locus, allow_peel, cunrolli))
return true;
if (create_iv
@@ -1358,7 +1358,7 @@ canonicalize_induction_variables (void)
{
changed |= canonicalize_loop_induction_variables (loop,
true, UL_SINGLE_ITER,
- true, false);
+ true, false, false);
}
gcc_assert (!need_ssa_update_p (cfun));
@@ -1392,7 +1392,7 @@ canonicalize_induction_variables (void)
static bool
tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
- bitmap father_bbs, class loop *loop)
+ bitmap father_bbs, class loop *loop, bool cunrolli)
{
class loop *loop_father;
bool changed = false;
@@ -1410,7 +1410,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
if (!child_father_bbs)
child_father_bbs = BITMAP_ALLOC (NULL);
if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
- child_father_bbs, inner))
+ child_father_bbs, inner, cunrolli))
{
bitmap_ior_into (father_bbs, child_father_bbs);
bitmap_clear (child_father_bbs);
@@ -1456,7 +1456,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
ul = UL_NO_GROWTH;
if (canonicalize_loop_induction_variables
- (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
+ (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
{
/* If we'll continue unrolling, we need to propagate constants
within the new basic blocks to fold away induction variable
@@ -1491,6 +1491,7 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
bool changed;
int iteration = 0;
bool irred_invalidated = false;
+ bool cunrolli = true;
estimate_numbers_of_iterations (cfun);
@@ -1507,10 +1508,15 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
changed = tree_unroll_loops_completely_1 (may_increase_size,
unroll_outer, father_bbs,
- current_loops->tree_root);
+ current_loops->tree_root,
+ cunrolli);
if (changed)
{
unsigned i;
+ /* For the outer loop, considering that the inner loop is completely
+ unrolled, it would expose more optimization opportunities, so it's
+ better to keep 2/3 reduction of estimated unrolled size. */
+ cunrolli = false;
unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
edges_to_remove, loop_closed_ssa_invalidated,
@@ -1670,8 +1676,7 @@ pass_complete_unroll::execute (function *fun)
re-peeling the same loop multiple times. */
if (flag_peel_loops)
peeled_loops = BITMAP_ALLOC (NULL);
- unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
- true);
+ unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
if (peeled_loops)
{
BITMAP_FREE (peeled_loops);