aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorZdenek Dvorak <rakdver@gcc.gnu.org>2007-05-29 21:55:47 +0000
committerZdenek Dvorak <rakdver@gcc.gnu.org>2007-05-29 21:55:47 +0000
commit5417e0224b5fef840b1faa50c43dc660579e4f7b (patch)
treee755e23b8c84859e189788b0f53371eb79533cf9 /gcc
parentcd5ecab6a73af62791d39b85db942600ccc37dad (diff)
downloadgcc-5417e0224b5fef840b1faa50c43dc660579e4f7b.zip
gcc-5417e0224b5fef840b1faa50c43dc660579e4f7b.tar.gz
gcc-5417e0224b5fef840b1faa50c43dc660579e4f7b.tar.bz2
tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer. (SET_DR_MISALIGNMENT): New. * tree-vect-analyze.c (vect_compute_data_ref_alignment, vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment): Use SET_DR_MISALIGNMENT. * tree-predcom.c (split_data_refs_to_components): Cast dr->aux from pointer. * tree-data-ref.c (create_data_ref, compute_all_dependences, find_loop_nest): Export. * tree-data-ref.h (struct data_reference): Change aux field to pointer. (create_data_ref, compute_all_dependences, find_loop_nest): Declare. * tree-ssa-loop-prefetch.c: Include tree-data-ref.h. (L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION): New macros. (struct mem_ref): Add field reuse_distance. (find_or_create_group, record_ref): Use XNEW instead of xcalloc. Initialize reuse_distance field. (issue_prefetch_ref): Select temporality of prefetch according to reuse_distance. (volume_of_references, volume_of_dist_vector, add_subscript_strides, self_reuse_distance, determine_loop_nest_reuse): New functions. (loop_prefetch_arrays): Call determine_loop_nest_reuse. (tree_ssa_prefetch_arrays): Dump L2 cache size. * Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H dependency. * gcc.dg/tree-ssa/prefetch-6.c: New test. From-SVN: r125172
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog30
-rw-r--r--gcc/Makefile.in2
-rw-r--r--gcc/testsuite/ChangeLog4
-rw-r--r--gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c54
-rw-r--r--gcc/tree-data-ref.c6
-rw-r--r--gcc/tree-data-ref.h7
-rw-r--r--gcc/tree-predcom.c10
-rw-r--r--gcc/tree-ssa-loop-prefetch.c347
-rw-r--r--gcc/tree-vect-analyze.c20
-rw-r--r--gcc/tree-vectorizer.h3
10 files changed, 453 insertions, 30 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3dec250..9c74295 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,9 +1,37 @@
+2007-05-29 Zdenek Dvorak <dvorakz@suse.cz>
+
+ * tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
+ (SET_DR_MISALIGNMENT): New.
+ * tree-vect-analyze.c (vect_compute_data_ref_alignment,
+ vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
+ Use SET_DR_MISALIGNMENT.
+ * tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
+ pointer.
+ * tree-data-ref.c (create_data_ref, compute_all_dependences,
+ find_loop_nest): Export.
+ * tree-data-ref.h (struct data_reference): Change aux field to pointer.
+ (create_data_ref, compute_all_dependences, find_loop_nest): Declare.
+ * tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
+ (L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
+ New macros.
+ (struct mem_ref): Add field reuse_distance.
+ (find_or_create_group, record_ref): Use XNEW instead of xcalloc.
+ Initialize reuse_distance field.
+ (issue_prefetch_ref): Select temporality of prefetch according to
+ reuse_distance.
+ (volume_of_references, volume_of_dist_vector, add_subscript_strides,
+ self_reuse_distance, determine_loop_nest_reuse): New functions.
+ (loop_prefetch_arrays): Call determine_loop_nest_reuse.
+ (tree_ssa_prefetch_arrays): Dump L2 cache size.
+ * Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
+ dependency.
+
2007-05-29 Daniel Berlin <dberlin@dberlin.org>
* tree-ssa-alias.c: Add aliasing overview.
2007-05-29 Zuxy Meng <zuxy.meng@gmail.com>
- Danny Smith <dannysmith@users.sourceforge.net>
+ Danny Smith <dannysmith@users.sourceforge.net>
PR target/29498
* config/i386/t-crtfm: Compile crtfastmath.o with
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 8064b2f..605a0bd 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2084,7 +2084,7 @@ tree-ssa-loop-prefetch.o: tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H) \
$(CFGLOOP_H) $(PARAMS_H) langhooks.h $(BASIC_BLOCK_H) hard-reg-set.h \
- tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H)
+ tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H) $(TREE_DATA_REF_H)
tree-predcom.o: tree-predcom.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_P_H) \
$(CFGLOOP_H) $(TREE_FLOW_H) $(GGC_H) $(TREE_DATA_REF_H) $(SCEV_H) \
$(PARAMS_H) $(DIAGNOSTIC_H) tree-pass.h $(TM_H) coretypes.h tree-affine.h \
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 696a478..6340ca4 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-05-29 Zdenek Dvorak <dvorakz@suse.cz>
+
+ * gcc.dg/tree-ssa/prefetch-6.c: New test.
+
2007-05-29 Tobias Schlüter <tobi@gcc.gnu.org>
* gfortran.dg/sizeof.f90: New.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c
new file mode 100644
index 0000000..44f65bf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse --param simultaneous-prefetches=100 -fdump-tree-aprefetch-details" } */
+
+#define N 1000
+#define K 900
+
+double a[N][N];
+
+double test(void)
+{
+ unsigned i, j;
+ double sum = 0;
+
+ /* Here, we should use non-temporal prefetch instruction. */
+ for (i = 0; i < K; i++)
+ for (j = 0; j < K; j++)
+ sum += a[i][j];
+
+ /* Here, we should not use non-temporal prefetch instruction, since the
+ value of a[i+10][j] is reused in L2 cache. */
+ for (i = 0; i < K; i++)
+ for (j = 0; j < K; j++)
+ sum += a[i][j] * a[i + 10][j];
+
+ /* Here, we should use non-temporal prefetch instruction, since the
+ value of a[i+100][j] is too far to be reused in L2 cache. */
+ for (i = 0; i < K; i++)
+ for (j = 0; j < K; j++)
+ sum += a[i][j] * a[i + 100][j];
+
+ /* Here, temporal prefetches should be used, since the volume of the
+ memory accesses is smaller than L2 cache. */
+ for (i = 0; i < 100; i++)
+ for (j = 0; j < 100; j++)
+ sum += a[i][j] * a[i + 100][j];
+
+ /* Temporal prefetches should be used here (even though the accesses to
+ a[j][i] are independent, the same cache line is almost always hit
+ every N iterations). */
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ sum += a[j][i];
+
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "Issued prefetch" 5 "aprefetch" } } */
+/* { dg-final { scan-tree-dump-times "Issued nontemporal prefetch" 3 "aprefetch" } } */
+
+/* { dg-final { scan-assembler-times "prefetcht" 5 } } */
+/* { dg-final { scan-assembler-times "prefetchnta" 3 } } */
+
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index ef3353f..99f8fd2 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -771,7 +771,7 @@ free_data_ref (data_reference_p dr)
data_reference description of MEMREF. NEST is the outermost loop of the
loop nest in that the reference should be analysed. */
-static struct data_reference *
+struct data_reference *
create_data_ref (struct loop *nest, tree memref, tree stmt, bool is_read)
{
struct data_reference *dr;
@@ -3843,7 +3843,7 @@ compute_self_dependence (struct data_dependence_relation *ddr)
COMPUTE_SELF_AND_RR is FALSE, don't compute read-read and self
relations. */
-static void
+void
compute_all_dependences (VEC (data_reference_p, heap) *datarefs,
VEC (ddr_p, heap) **dependence_relations,
VEC (loop_p, heap) *loop_nest,
@@ -4055,7 +4055,7 @@ find_loop_nest_1 (struct loop *loop, VEC (loop_p, heap) **loop_nest)
contain the loops from the outermost to the innermost, as they will
appear in the classic distance vector. */
-static bool
+bool
find_loop_nest (struct loop *loop, VEC (loop_p, heap) **loop_nest)
{
VEC_safe_push (loop_p, heap, *loop_nest, loop);
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index b53c6f1..ddff36a 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -104,7 +104,7 @@ struct data_reference
tree ref;
/* Auxiliary info specific to a pass. */
- int aux;
+ void *aux;
/* True when the data reference is in RHS of a stmt. */
bool is_read;
@@ -320,7 +320,10 @@ extern void dump_data_dependence_direction (FILE *,
extern void free_dependence_relation (struct data_dependence_relation *);
extern void free_dependence_relations (VEC (ddr_p, heap) *);
extern void free_data_refs (VEC (data_reference_p, heap) *);
-
+struct data_reference *create_data_ref (struct loop *, tree, tree, bool);
+bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
+void compute_all_dependences (VEC (data_reference_p, heap) *,
+ VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool);
/* Return the index of the variable VAR in the LOOP_NEST array. */
diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index bae23ed..a3a137f 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -700,7 +700,7 @@ split_data_refs_to_components (struct loop *loop,
just fail. */
goto end;
}
- dr->aux = i;
+ dr->aux = (void *) (size_t) i;
comp_father[i] = i;
comp_size[i] = 1;
}
@@ -715,7 +715,7 @@ split_data_refs_to_components (struct loop *loop,
if (!suitable_reference_p (dr, &dummy))
{
- ia = dr->aux;
+ ia = (unsigned) (size_t) dr->aux;
merge_comps (comp_father, comp_size, n, ia);
}
}
@@ -729,8 +729,8 @@ split_data_refs_to_components (struct loop *loop,
dra = DDR_A (ddr);
drb = DDR_B (ddr);
- ia = component_of (comp_father, dra->aux);
- ib = component_of (comp_father, drb->aux);
+ ia = component_of (comp_father, (unsigned) (size_t) dra->aux);
+ ib = component_of (comp_father, (unsigned) (size_t) drb->aux);
if (ia == ib)
continue;
@@ -749,7 +749,7 @@ split_data_refs_to_components (struct loop *loop,
bad = component_of (comp_father, n);
for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
{
- ia = dr->aux;
+ ia = (unsigned) (size_t) dr->aux;
ca = component_of (comp_father, ia);
if (ca == bad)
continue;
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index 35e8021..3159748 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
+#include "tree-data-ref.h"
/* This pass inserts prefetch instructions to optimize cache usage during
accesses to arrays in loops. It processes loops sequentially and:
@@ -82,6 +83,10 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
7/32.
(5) has PREFETCH_MOD 1 as well.
+ Additionally, we use data dependence analysis to determine for each
+ reference the distance till the first reuse; this information is used
+ to determine the temporality of the issued prefetch instruction.
+
3) We determine how much ahead we need to prefetch. The number of
iterations needed is time to fetch / time spent in one iteration of
the loop. The problem is that we do not know either of these values,
@@ -161,6 +166,17 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#define HAVE_prefetch 0
#endif
+#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * L1_CACHE_LINE_SIZE))
+/* TODO: Add parameter to specify L2 cache size. */
+#define L2_CACHE_SIZE_BYTES (8 * L1_CACHE_SIZE_BYTES)
+
+/* We consider a memory access nontemporal if it is not reused sooner than
+ after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
+ accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+ so that we use nontemporal prefetches e.g. if single memory location
+ is accessed several times in a single iteration of the loop. */
+#define NONTEMPORAL_FRACTION 16
+
/* The group of references between that reuse may occur. */
struct mem_ref_group
@@ -190,6 +206,8 @@ struct mem_ref
unsigned HOST_WIDE_INT prefetch_before;
/* Prefetch only first PREFETCH_BEFORE
iterations. */
+ unsigned reuse_distance; /* The amount of data accessed before the first
+ reuse of this value. */
bool issue_prefetch_p; /* Should we really issue the prefetch? */
struct mem_ref *next; /* The next reference in the group. */
};
@@ -236,7 +254,7 @@ find_or_create_group (struct mem_ref_group **groups, tree base,
break;
}
- group = xcalloc (1, sizeof (struct mem_ref_group));
+ group = XNEW (struct mem_ref_group);
group->base = base;
group->step = step;
group->refs = NULL;
@@ -273,13 +291,14 @@ record_ref (struct mem_ref_group *group, tree stmt, tree mem,
return;
}
- (*aref) = xcalloc (1, sizeof (struct mem_ref));
+ (*aref) = XNEW (struct mem_ref);
(*aref)->stmt = stmt;
(*aref)->mem = mem;
(*aref)->delta = delta;
(*aref)->write_p = write_p;
(*aref)->prefetch_before = PREFETCH_ALL;
(*aref)->prefetch_mod = 1;
+ (*aref)->reuse_distance = 0;
(*aref)->issue_prefetch_p = false;
(*aref)->group = group;
(*aref)->next = NULL;
@@ -815,12 +834,15 @@ static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
HOST_WIDE_INT delta;
- tree addr, addr_base, prefetch, write_p;
+ tree addr, addr_base, prefetch, write_p, local;
block_stmt_iterator bsi;
unsigned n_prefetches, ap;
+ bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
+ fprintf (dump_file, "Issued%s prefetch for %p.\n",
+ nontemporal ? " nontemporal" : "",
+ (void *) ref);
bsi = bsi_for_stmt (ref->stmt);
@@ -829,6 +851,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
write_p = ref->write_p ? integer_one_node : integer_zero_node;
+ local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
for (ap = 0; ap < n_prefetches; ap++)
{
@@ -840,7 +863,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/* Create the prefetch instruction. */
prefetch = build_call_expr (built_in_decls[BUILT_IN_PREFETCH],
- 2, addr, write_p);
+ 3, addr, write_p, local);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
}
}
@@ -935,6 +958,311 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
return factor;
}
+/* Returns the total volume of the memory references REFS, taking into account
+ reuses in the innermost loop and cache line size. TODO -- we should also
+ take into account reuses across the iterations of the loops in the loop
+ nest. */
+
+static unsigned
+volume_of_references (struct mem_ref_group *refs)
+{
+ unsigned volume = 0;
+ struct mem_ref_group *gr;
+ struct mem_ref *ref;
+
+ for (gr = refs; gr; gr = gr->next)
+ for (ref = gr->refs; ref; ref = ref->next)
+ {
+ /* Almost always reuses another value? */
+ if (ref->prefetch_before != PREFETCH_ALL)
+ continue;
+
+ /* If several iterations access the same cache line, use the size of
+ the line divided by this number. Otherwise, a cache line is
+ accessed in each iteration. TODO -- in the latter case, we should
+ take the size of the reference into account, rounding it up on cache
+ line size multiple. */
+ volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
+ }
+ return volume;
+}
+
+/* Returns the volume of memory references accessed across VEC iterations of
+ loops, whose sizes are described in the LOOP_SIZES array. N is the number
+ of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
+
+static unsigned
+volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
+{
+ unsigned i;
+
+ for (i = 0; i < n; i++)
+ if (vec[i] != 0)
+ break;
+
+ if (i == n)
+ return 0;
+
+ gcc_assert (vec[i] > 0);
+
+ /* We ignore the parts of the distance vector in subloops, since usually
+ the numbers of iterations are much smaller. */
+ return loop_sizes[i] * vec[i];
+}
+
+/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
+ at the position corresponding to the loop of the step. N is the depth
+ of the considered loop nest, and, LOOP is its innermost loop. */
+
+static void
+add_subscript_strides (tree access_fn, unsigned stride,
+ HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
+{
+ struct loop *aloop;
+ tree step;
+ HOST_WIDE_INT astep;
+ unsigned min_depth = loop_depth (loop) - n;
+
+ while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
+ {
+ aloop = get_chrec_loop (access_fn);
+ step = CHREC_RIGHT (access_fn);
+ access_fn = CHREC_LEFT (access_fn);
+
+ if ((unsigned) loop_depth (aloop) <= min_depth)
+ continue;
+
+ if (host_integerp (step, 0))
+ astep = tree_low_cst (step, 0);
+ else
+ astep = L1_CACHE_LINE_SIZE;
+
+ strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
+
+ }
+}
+
+/* Returns the volume of memory references accessed between two consecutive
+ self-reuses of the reference DR. We consider the subscripts of DR in N
+ loops, and LOOP_SIZES contains the volumes of accesses in each of the
+ loops. LOOP is the innermost loop of the current loop nest. */
+
+static unsigned
+self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
+ struct loop *loop)
+{
+ tree stride, access_fn;
+ HOST_WIDE_INT *strides, astride;
+ VEC (tree, heap) *access_fns;
+ tree ref = DR_REF (dr);
+ unsigned i, ret = ~0u;
+
+ /* In the following example:
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ use (a[j][i]);
+ the same cache line is accessed each N steps (except if the change from
+ i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
+ we cannot rely purely on the results of the data dependence analysis.
+
+ Instead, we compute the stride of the reference in each loop, and consider
+ the innermost loop in that the stride is less than cache size. */
+
+ strides = XCNEWVEC (HOST_WIDE_INT, n);
+ access_fns = DR_ACCESS_FNS (dr);
+
+ for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
+ {
+ /* Keep track of the reference corresponding to the subscript, so that we
+ know its stride. */
+ while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
+ ref = TREE_OPERAND (ref, 0);
+
+ if (TREE_CODE (ref) == ARRAY_REF)
+ {
+ stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+ if (host_integerp (stride, 1))
+ astride = tree_low_cst (stride, 1);
+ else
+ astride = L1_CACHE_LINE_SIZE;
+
+ ref = TREE_OPERAND (ref, 0);
+ }
+ else
+ astride = 1;
+
+ add_subscript_strides (access_fn, astride, strides, n, loop);
+ }
+
+ for (i = n; i-- > 0; )
+ {
+ unsigned HOST_WIDE_INT s;
+
+ s = strides[i] < 0 ? -strides[i] : strides[i];
+
+ if (s < (unsigned) L1_CACHE_LINE_SIZE
+ && (loop_sizes[i]
+ > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
+ {
+ ret = loop_sizes[i];
+ break;
+ }
+ }
+
+ free (strides);
+ return ret;
+}
+
+/* Determines the distance till the first reuse of each reference in REFS
+ in the loop nest of LOOP. */
+
+static void
+determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs)
+{
+ struct loop *nest, *aloop;
+ VEC (data_reference_p, heap) *datarefs = NULL;
+ VEC (ddr_p, heap) *dependences = NULL;
+ struct mem_ref_group *gr;
+ struct mem_ref *ref;
+ VEC (loop_p, heap) *vloops = NULL;
+ unsigned *loop_data_size;
+ unsigned i, j, n;
+ unsigned volume, dist, adist;
+ HOST_WIDE_INT vol;
+ data_reference_p dr;
+ ddr_p dep;
+
+ if (loop->inner)
+ return;
+
+ /* Find the outermost loop of the loop nest of loop (we require that
+ there are no sibling loops inside the nest). */
+ nest = loop;
+ while (1)
+ {
+ aloop = loop_outer (nest);
+
+ if (aloop == current_loops->tree_root
+ || aloop->inner->next)
+ break;
+
+ nest = aloop;
+ }
+
+ /* For each loop, determine the amount of data accessed in each iteration.
+ We use this to estimate whether the reference is evicted from the
+ cache before its reuse. */
+ find_loop_nest (nest, &vloops);
+ n = VEC_length (loop_p, vloops);
+ loop_data_size = XNEWVEC (unsigned, n);
+ volume = volume_of_references (refs);
+ i = n;
+ while (i-- != 0)
+ {
+ loop_data_size[i] = volume;
+ /* Bound the volume by the L2 cache size, since above this bound,
+ all dependence distances are equivalent. */
+ if (volume > L2_CACHE_SIZE_BYTES)
+ continue;
+
+ aloop = VEC_index (loop_p, vloops, i);
+ vol = estimated_loop_iterations_int (aloop, false);
+ if (vol < 0)
+ vol = expected_loop_iterations (aloop);
+ volume *= vol;
+ }
+
+ /* Prepare the references in the form suitable for data dependence
+ analysis. We ignore unanalysable data references (the results
+ are used just as a heuristics to estimate temporality of the
+ references, hence we do not need to worry about correctness). */
+ for (gr = refs; gr; gr = gr->next)
+ for (ref = gr->refs; ref; ref = ref->next)
+ {
+ dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
+
+ if (dr)
+ {
+ ref->reuse_distance = volume;
+ dr->aux = ref;
+ VEC_safe_push (data_reference_p, heap, datarefs, dr);
+ }
+ }
+
+ for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
+ {
+ dist = self_reuse_distance (dr, loop_data_size, n, loop);
+ ref = dr->aux;
+ if (ref->reuse_distance > dist)
+ ref->reuse_distance = dist;
+ }
+
+ compute_all_dependences (datarefs, &dependences, vloops, true);
+
+ for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
+ {
+ if (DDR_ARE_DEPENDENT (dep) == chrec_known)
+ continue;
+
+ if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
+ || DDR_NUM_DIST_VECTS (dep) == 0)
+ {
+ /* If the dependence cannot be analysed, assume that there might be
+ a reuse. */
+ dist = 0;
+ }
+ else
+ {
+ /* The distance vectors are normalised to be always lexicographically
+ positive, hence we cannot tell just from them whether DDR_A comes
+ before DDR_B or vice versa. However, it is not important,
+ anyway -- if DDR_A is close to DDR_B, then it is either reused in
+ DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
+ in cache (and marking it as nontemporal would not affect
+ anything). */
+
+ dist = volume;
+ for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
+ {
+ adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
+ loop_data_size, n);
+
+ /* Ignore accesses closer than
+ L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+ so that we use nontemporal prefetches e.g. if single memory
+ location is accessed several times in a single iteration of
+ the loop. */
+ if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
+ continue;
+
+ if (adist < dist)
+ dist = adist;
+ }
+ }
+
+ ref = DDR_A (dep)->aux;
+ if (ref->reuse_distance > dist)
+ ref->reuse_distance = dist;
+ ref = DDR_B (dep)->aux;
+ if (ref->reuse_distance > dist)
+ ref->reuse_distance = dist;
+ }
+
+ free_dependence_relations (dependences);
+ free_data_refs (datarefs);
+ free (loop_data_size);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Reuse distances:\n");
+ for (gr = refs; gr; gr = gr->next)
+ for (ref = gr->refs; ref; ref = ref->next)
+ fprintf (dump_file, " ref %p distance %u\n",
+ (void *) ref, ref->reuse_distance);
+ }
+}
+
/* Issue prefetch instructions for array references in LOOP. Returns
true if the LOOP was unrolled. */
@@ -963,6 +1291,8 @@ loop_prefetch_arrays (struct loop *loop)
if (!anything_to_prefetch_p (refs))
goto fail;
+ determine_loop_nest_reuse (loop, refs);
+
/* Step 3: determine the ahead and unroll factor. */
/* FIXME: the time should be weighted by the probabilities of the blocks in
@@ -1034,10 +1364,11 @@ tree_ssa_prefetch_arrays (void)
fprintf (dump_file, " simultaneous prefetches: %d\n",
SIMULTANEOUS_PREFETCHES);
fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
- fprintf (dump_file, " L1 cache size: %d (%d bytes)\n",
- L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
- fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
+ fprintf (dump_file, " L1 cache size: %d lines, %d bytes\n",
+ L1_CACHE_SIZE, L1_CACHE_SIZE_BYTES);
+ fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
+ fprintf (dump_file, " L2 cache size: %d bytes\n", L2_CACHE_SIZE_BYTES);
fprintf (dump_file, "\n");
}
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index 6636cfd..647d98f 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -1128,7 +1128,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
fprintf (vect_dump, "vect_compute_data_ref_alignment:");
/* Initialize misalignment to unknown. */
- DR_MISALIGNMENT (dr) = -1;
+ SET_DR_MISALIGNMENT (dr, -1);
misalign = DR_INIT (dr);
aligned_to = DR_ALIGNED_TO (dr);
@@ -1198,7 +1198,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
return false;
}
- DR_MISALIGNMENT (dr) = TREE_INT_CST_LOW (misalign);
+ SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
if (vect_print_dump_info (REPORT_DETAILS))
{
@@ -1267,21 +1267,23 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
continue;
gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
DR_MISALIGNMENT (dr_peel) / dr_peel_size);
- DR_MISALIGNMENT (dr) = 0;
+ SET_DR_MISALIGNMENT (dr, 0);
return;
}
if (known_alignment_for_access_p (dr)
&& known_alignment_for_access_p (dr_peel))
{
- DR_MISALIGNMENT (dr) += npeel * dr_size;
- DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD;
+ int misal = DR_MISALIGNMENT (dr);
+ misal += npeel * dr_size;
+ misal %= UNITS_PER_SIMD_WORD;
+ SET_DR_MISALIGNMENT (dr, misal);
return;
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Setting misalignment to -1.");
- DR_MISALIGNMENT (dr) = -1;
+ SET_DR_MISALIGNMENT (dr, -1);
}
@@ -1577,7 +1579,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
save_misalignment = DR_MISALIGNMENT (dr);
vect_update_misalignment_for_peel (dr, dr0, npeel);
supportable_dr_alignment = vect_supportable_dr_alignment (dr);
- DR_MISALIGNMENT (dr) = save_misalignment;
+ SET_DR_MISALIGNMENT (dr, save_misalignment);
if (!supportable_dr_alignment)
{
@@ -1601,7 +1603,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
- DR_MISALIGNMENT (dr0) = 0;
+ SET_DR_MISALIGNMENT (dr0, 0);
if (vect_print_dump_info (REPORT_ALIGNMENT))
fprintf (vect_dump, "Alignment of access forced using peeling.");
@@ -1702,7 +1704,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
dr = STMT_VINFO_DATA_REF (stmt_info);
- DR_MISALIGNMENT (dr) = 0;
+ SET_DR_MISALIGNMENT (dr, 0);
if (vect_print_dump_info (REPORT_ALIGNMENT))
fprintf (vect_dump, "Alignment of access forced using versioning.");
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index dd8e2b8a..6e20652 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -339,7 +339,8 @@ is_pattern_stmt_p (stmt_vec_info stmt_info)
/* Reflects actual alignment of first access in the vectorized loop,
taking into account peeling/versioning if applied. */
-#define DR_MISALIGNMENT(DR) (DR)->aux
+#define DR_MISALIGNMENT(DR) ((int) (size_t) (DR)->aux)
+#define SET_DR_MISALIGNMENT(DR, VAL) ((DR)->aux = (void *) (size_t) (VAL))
static inline bool
aligned_access_p (struct data_reference *data_ref_info)