aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorZdenek Dvorak <dvorakz@suse.cz>2006-11-12 21:05:49 +0100
committerZdenek Dvorak <rakdver@gcc.gnu.org>2006-11-12 20:05:49 +0000
commit911b3fdbe38f8a259a8212773098bf201423a242 (patch)
tree872aef0e893c395cc6fdd9d9fe625854bf2281f4 /gcc
parent946e1bc7575f0676faf87b8f9937e3a2a8b828af (diff)
downloadgcc-911b3fdbe38f8a259a8212773098bf201423a242.zip
gcc-911b3fdbe38f8a259a8212773098bf201423a242.tar.gz
gcc-911b3fdbe38f8a259a8212773098bf201423a242.tar.bz2
tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve comments.
* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve comments. (issue_prefetch_ref): Move assignment to write_p out of loop. (determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and SIMULTANEOUS_PREFETCHES into account. (loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor. * lambda-code.c (lcm): Renamed to ... (least_common_multiple): ... and exported. * tree-flow.h (least_common_multiple): Declare. From-SVN: r118730
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog12
-rw-r--r--gcc/lambda-code.c6
-rw-r--r--gcc/testsuite/ChangeLog4
-rw-r--r--gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c22
-rw-r--r--gcc/tree-flow.h3
-rw-r--r--gcc/tree-ssa-loop-prefetch.c103
6 files changed, 94 insertions, 56 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 845e235..e29ed31 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,17 @@
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
+ * tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
+ comments.
+ (issue_prefetch_ref): Move assignment to write_p out of loop.
+ (determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
+ SIMULTANEOUS_PREFETCHES into account.
+ (loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
+ * lambda-code.c (lcm): Renamed to ...
+ (least_common_multiple): ... and exported.
+ * tree-flow.h (least_common_multiple): Declare.
+
+2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
+
* Makefile.in (tree-data-ref.o): Add langhooks.h dependency.
* tree-ssa-loop-niter.c (derive_constant_upper_bound): Follow
ud-chains. Handle AND_EXPR.
diff --git a/gcc/lambda-code.c b/gcc/lambda-code.c
index 2a03fd3..3dfad91 100644
--- a/gcc/lambda-code.c
+++ b/gcc/lambda-code.c
@@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest)
/* Compute the least common multiple of two numbers A and B . */
-static int
-lcm (int a, int b)
+int
+least_common_multiple (int a, int b)
{
return (abs (a) * abs (b) / gcd (a, b));
}
@@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size,
{
if (A[k][i] < 0)
{
- multiple = lcm (A[j][i], A[k][i]);
+ multiple = least_common_multiple (A[j][i], A[k][i]);
f1 = multiple / A[j][i];
f2 = -1 * multiple / A[k][i];
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 5a2b502..71597e2 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
+
+ * gcc.dg/tree-ssa/prefetch-3.c: New test.
+
2006-11-12 Roger Sayle <roger@eyesopen.com>
PR tree-optimization/13827
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c
new file mode 100644
index 0000000..c5767a7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c
@@ -0,0 +1,22 @@
+/* Prefetching used to prefer nonsensical unroll factor of 5 in this testcase. */
+
+/* { dg-do compile { target i?86-*-* } } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse -fdump-tree-aprefetch-details" } */
+
+#define N 1000000
+
+double a[N];
+
+double test(void)
+{
+ unsigned i;
+ double sum = 0;
+
+ for (i = 0; i < N; i += 2)
+ sum += (a[i] * a[i+1]);
+
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "unroll factor 4" 1 "aprefetch" } } */
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
diff --git a/gcc/tree-flow.h b/gcc/tree-flow.h
index daa19d4..a880c32 100644
--- a/gcc/tree-flow.h
+++ b/gcc/tree-flow.h
@@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *);
extern void recalculate_used_alone (void);
extern bool updating_used_alone;
+
+int least_common_multiple (int, int);
+
#endif /* _TREE_FLOW_H */
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index 41ada26..b40fc0f 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -744,19 +744,21 @@ static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
unsigned ahead)
{
- unsigned max_prefetches, n_prefetches;
+ unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
+ unsigned slots_per_prefetch;
struct mem_ref *ref;
bool any = false;
- max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
- if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
- max_prefetches = SIMULTANEOUS_PREFETCHES;
+ /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
+ remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
+ /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
+ AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
+ it will need a prefetch slot. */
+ slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
-
- if (!max_prefetches)
- return false;
+ fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
+ slots_per_prefetch);
/* For now we just take memory references one by one and issue
prefetches for as many as possible. The groups are sorted
@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
if (!should_issue_prefetch_p (ref))
continue;
- ref->issue_prefetch_p = true;
-
- /* If prefetch_mod is less then unroll_factor, we need to insert
- several prefetches for the reference. */
+ /* If we need to prefetch the reference each PREFETCH_MOD iterations,
+ and we unroll the loop UNROLL_FACTOR times, we need to insert
+ ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
+ iteration. */
n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
/ ref->prefetch_mod);
- if (max_prefetches <= n_prefetches)
- return true;
+ prefetch_slots = n_prefetches * slots_per_prefetch;
+
+ /* If more than half of the prefetches would be lost anyway, do not
+ issue the prefetch. */
+ if (2 * remaining_prefetch_slots < prefetch_slots)
+ continue;
+
+ ref->issue_prefetch_p = true;
- max_prefetches -= n_prefetches;
+ if (remaining_prefetch_slots <= prefetch_slots)
+ return true;
+ remaining_prefetch_slots -= prefetch_slots;
any = true;
}
@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/ ref->prefetch_mod);
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
+ write_p = ref->write_p ? integer_one_node : integer_zero_node;
for (ap = 0; ap < n_prefetches; ap++)
{
@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
/* Create the prefetch instruction. */
- write_p = ref->write_p ? integer_one_node : integer_zero_node;
params = tree_cons (NULL_TREE, addr,
tree_cons (NULL_TREE, write_p, NULL_TREE));
-
+
prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
params);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
- unsigned ahead, unsigned ninsns,
- struct tree_niter_desc *desc)
+ unsigned ninsns, struct tree_niter_desc *desc)
{
- unsigned upper_bound, size_factor, constraint_factor;
- unsigned factor, max_mod_constraint, ahead_factor;
+ unsigned upper_bound;
+ unsigned nfactor, factor, mod_constraint;
struct mem_ref_group *agp;
struct mem_ref *ref;
- upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
-
- /* First check whether the loop is not too large to unroll. */
- size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
- if (size_factor <= 1)
+ /* First check whether the loop is not too large to unroll. We ignore
+ PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
+ from unrolling them enough to make exactly one cache line covered by each
+ iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
+ us from unrolling the loops too many times in cases where we only expect
+ gains from better scheduling and decreasing loop overhead, which is not
+ the case here. */
+ upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+ if (upper_bound <= 1)
return 1;
- if (size_factor < upper_bound)
- upper_bound = size_factor;
-
- max_mod_constraint = 1;
+ /* Choose the factor so that we may prefetch each cache just once,
+ but bound the unrolling by UPPER_BOUND. */
+ factor = 1;
for (agp = refs; agp; agp = agp->next)
for (ref = agp->refs; ref; ref = ref->next)
- if (should_issue_prefetch_p (ref)
- && ref->prefetch_mod > max_mod_constraint)
- max_mod_constraint = ref->prefetch_mod;
-
- /* Set constraint_factor as large as needed to be able to satisfy the
- largest modulo constraint. */
- constraint_factor = max_mod_constraint;
-
- /* If ahead is too large in comparison with the number of available
- prefetches, unroll the loop as much as needed to be able to prefetch
- at least partially some of the references in the loop. */
- ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
- / SIMULTANEOUS_PREFETCHES);
-
- /* Unroll as much as useful, but bound the code size growth. */
- if (constraint_factor < ahead_factor)
- factor = ahead_factor;
- else
- factor = constraint_factor;
- if (factor > upper_bound)
- factor = upper_bound;
+ if (should_issue_prefetch_p (ref))
+ {
+ mod_constraint = ref->prefetch_mod;
+ nfactor = least_common_multiple (mod_constraint, factor);
+ if (nfactor <= upper_bound)
+ factor = nfactor;
+ }
if (!should_unroll_loop_p (loop, desc, factor))
return 1;
@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
instructions executed per iteration of the loop. */
ninsns = tree_num_loop_insns (loop);
ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
- unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
- &desc);
+ unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);