Diffstat (limited to 'gcc')
-rw-r--r--  gcc/ChangeLog                                |   7
-rw-r--r--  gcc/config/i386/driver-i386.c                | 136
-rw-r--r--  gcc/testsuite/ChangeLog                      |   4
-rw-r--r--  gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c   |  18
-rw-r--r--  gcc/tree-ssa-loop-prefetch.c                 |  45
5 files changed, 192 insertions, 18 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b0e26a9..9c75bc2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2007-03-01 Zdenek Dvorak <dvorakz@suse.cz>
+
+ * tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
+ factor by the estimated number of iterations.
+ (loop_prefetch_arrays): Do not prefetch in loops that iterate fewer
+ times than the prefetch latency.
+
2007-03-01 Richard Henderson <rth@redhat.com>
* expr.c (emit_move_complex_push): Export.
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index df43512..3a5d29d 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -47,6 +47,131 @@ const char *host_detect_local_cpu (int argc, const char **argv);
#define bit_3DNOWP (1 << 30)
#define bit_LM (1 << 29)
+/* Returns parameters that describe an L1_ASSOC-way associative cache of
+ size L1_SIZEKB kB with lines of size L1_LINE bytes. */
+
+static char *
+describe_cache (unsigned l1_sizekb, unsigned l1_line,
+ unsigned l1_assoc ATTRIBUTE_UNUSED)
+{
+ char size[1000], line[1000];
+ unsigned size_in_lines;
+
+ /* At the moment, the GCC middle-end does not use the information about
+ the associativity of the cache. */
+
+ size_in_lines = (l1_sizekb * 1024) / l1_line;
+
+ sprintf (size, "--param l1-cache-size=%u", size_in_lines);
+ sprintf (line, "--param l1-cache-line-size=%u", l1_line);
+
+ return concat (size, " ", line, " ", NULL);
+}
+
+/* Returns the description of caches for an AMD processor. */
+
+static char *
+detect_caches_amd (unsigned max_ext_level)
+{
+ unsigned eax, ebx, ecx, edx;
+ unsigned l1_sizekb, l1_line, l1_assoc;
+
+ if (max_ext_level < 0x80000005)
+ return NULL;
+
+ cpuid (0x80000005, eax, ebx, ecx, edx);
+
+ l1_line = ecx & 0xff;
+ l1_sizekb = (ecx >> 24) & 0xff;
+ l1_assoc = (ecx >> 16) & 0xff;
+
+ return describe_cache (l1_sizekb, l1_line, l1_assoc);
+}
+
+/* Decodes the cache descriptors in REG and stores the size of the L1
+ cache and cache line, and the associativity of the cache, to L1_SIZEKB,
+ L1_LINE and L1_ASSOC. */
+
+static void
+decode_caches_intel (unsigned reg, unsigned *l1_sizekb, unsigned *l1_line,
+ unsigned *l1_assoc)
+{
+ unsigned i, val;
+
+ if (((reg >> 31) & 1) != 0)
+ return;
+
+ for (i = 0; i < 4; i++)
+ {
+ val = reg & 0xff;
+ reg >>= 8;
+
+ switch (val)
+ {
+ case 0xa:
+ *l1_sizekb = 8;
+ *l1_line = 32;
+ *l1_assoc = 2;
+ break;
+ case 0xc:
+ *l1_sizekb = 16;
+ *l1_line = 32;
+ *l1_assoc = 4;
+ break;
+ case 0x2c:
+ *l1_sizekb = 32;
+ *l1_line = 64;
+ *l1_assoc = 8;
+ break;
+ case 0x60:
+ *l1_sizekb = 16;
+ *l1_line = 64;
+ *l1_assoc = 8;
+ break;
+ case 0x66:
+ *l1_sizekb = 8;
+ *l1_line = 64;
+ *l1_assoc = 4;
+ break;
+ case 0x67:
+ *l1_sizekb = 16;
+ *l1_line = 64;
+ *l1_assoc = 4;
+ break;
+ case 0x68:
+ *l1_sizekb = 32;
+ *l1_line = 64;
+ *l1_assoc = 4;
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
+/* Returns the description of caches for an Intel processor. */
+
+static char *
+detect_caches_intel (unsigned max_level)
+{
+ unsigned eax, ebx, ecx, edx;
+ unsigned l1_sizekb = 0, l1_line = 0, assoc = 0;
+
+ if (max_level < 2)
+ return NULL;
+
+ cpuid (2, eax, ebx, ecx, edx);
+
+ decode_caches_intel (eax, &l1_sizekb, &l1_line, &assoc);
+ decode_caches_intel (ebx, &l1_sizekb, &l1_line, &assoc);
+ decode_caches_intel (ecx, &l1_sizekb, &l1_line, &assoc);
+ decode_caches_intel (edx, &l1_sizekb, &l1_line, &assoc);
+ if (!l1_sizekb)
+ return (char *) "";
+
+ return describe_cache (l1_sizekb, l1_line, assoc);
+}
+
/* This will be called by the spec parser in gcc.c when it sees
a %:local_cpu_detect(args) construct. Currently it will be called
with either "arch" or "tune" as argument depending on if -march=native
@@ -62,6 +187,7 @@ const char *host_detect_local_cpu (int argc, const char **argv);
const char *host_detect_local_cpu (int argc, const char **argv)
{
const char *cpu = NULL;
+ const char *cache = "";
enum processor_type processor = PROCESSOR_I386;
unsigned int eax, ebx, ecx, edx;
unsigned int max_level;
@@ -126,6 +252,14 @@ const char *host_detect_local_cpu (int argc, const char **argv)
is_amd = vendor == *(unsigned int*)"Auth";
+ if (!arch)
+ {
+ if (is_amd)
+ cache = detect_caches_amd (ext_level);
+ else if (vendor == *(unsigned int*)"Genu")
+ cache = detect_caches_intel (max_level);
+ }
+
if (is_amd)
{
if (has_mmx)
@@ -283,7 +417,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
}
done:
- return concat ("-m", argv[0], "=", cpu, NULL);
+ return concat (cache, "-m", argv[0], "=", cpu, NULL);
}
#else
/* If we aren't compiling with GCC we just provide a minimal
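
The AMD path above relies on CPUID function 0x80000005, which reports the
L1 data cache geometry in ECX: bits 31:24 give the size in kB, bits 23:16
the associativity, and bits 7:0 the line size in bytes. The following is a
minimal standalone sketch of that decoding; print_l1_params and the sample
ECX value are illustrative, not part of the patch:

    #include <stdio.h>

    /* Illustrative decoder for an ECX value as returned by CPUID
       function 0x80000005 on AMD processors.  */
    static void
    print_l1_params (unsigned ecx)
    {
      unsigned l1_line   = ecx & 0xff;          /* line size in bytes */
      unsigned l1_assoc  = (ecx >> 16) & 0xff;  /* associativity (unused) */
      unsigned l1_sizekb = (ecx >> 24) & 0xff;  /* total size in kB */
      unsigned size_in_lines = (l1_sizekb * 1024) / l1_line;

      (void) l1_assoc;
      /* Note that l1-cache-size is given in cache lines, matching
         describe_cache above.  */
      printf ("--param l1-cache-size=%u --param l1-cache-line-size=%u\n",
              size_in_lines, l1_line);
    }

    int
    main (void)
    {
      /* E.g. a K8-class part (64 kB, 2-way, 64-byte lines) reports
         roughly ecx = 0x40020140; this prints l1-cache-size=1024.  */
      print_l1_params (0x40020140);
      return 0;
    }
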
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 1f432cf..e2cf6d9 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-03-01 Zdenek Dvorak <dvorakz@suse.cz>
+
+ * gcc.dg/tree-ssa/prefetch-4.c: New test.
+
2007-03-01 Simon Baldwin <simonb@google.com>
PR c++/23689
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c
new file mode 100644
index 0000000..8a5230e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c
@@ -0,0 +1,18 @@
+/* The loop rolls too few times, hence prefetching would not be useful. */
+
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -fdump-tree-final_cleanup" } */
+
+int xxx[20];
+
+void foo (int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ xxx[i] = i;
+}
+
+/* { dg-final { scan-tree-dump-times "prefetch" 0 "final_cleanup" } } */
+/* { dg-final { cleanup-tree-dump "final_cleanup" } } */
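
To see why this test expects no prefetches: xxx has 20 elements, so GCC can
estimate that the loop runs at most 20 times. Assuming, say, a prefetch
latency of 100 cycles and about 5 weighted insns per iteration (both numbers
are illustrative; the real values come from the athlon cost tables), AHEAD =
ceil(100 / 5) = 20, so est_niter <= AHEAD and loop_prefetch_arrays bails out
before scheduling any prefetch, leaving zero "prefetch" matches in the
final_cleanup dump.
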
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index e0612b9..53977d8 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -885,13 +885,14 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
/* Determine the coefficient by which to unroll LOOP, from the information
contained in the list of memory references REFS. Description of
- umber of iterations of LOOP is stored to DESC. AHEAD is the number
- of iterations ahead that we need to prefetch. NINSNS is number of
- insns of the LOOP. */
+ number of iterations of LOOP is stored to DESC. NINSNS is the number of
+ insns of the LOOP. EST_NITER is the estimated number of iterations of
+ the loop, or -1 if no estimate is available. */
static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
- unsigned ninsns, struct tree_niter_desc *desc)
+ unsigned ninsns, struct tree_niter_desc *desc,
+ HOST_WIDE_INT est_niter)
{
unsigned upper_bound;
unsigned nfactor, factor, mod_constraint;
@@ -906,6 +907,12 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
gains from better scheduling and decreasing loop overhead, which is not
the case here. */
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+
+ /* If we unrolled the loop more times than it iterates, the unrolled
+ version of the loop would never be entered. */
+ if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
+ upper_bound = est_niter;
+
if (upper_bound <= 1)
return 1;
@@ -935,7 +942,8 @@ static bool
loop_prefetch_arrays (struct loop *loop)
{
struct mem_ref_group *refs;
- unsigned ahead, ninsns, unroll_factor;
+ unsigned ahead, ninsns, time, unroll_factor;
+ HOST_WIDE_INT est_niter;
struct tree_niter_desc desc;
bool unrolled = false;
@@ -950,21 +958,24 @@ loop_prefetch_arrays (struct loop *loop)
/* Step 3: determine the ahead and unroll factor. */
- /* FIXME: We should use not size of the loop, but the average number of
- instructions executed per iteration of the loop. */
- ninsns = tree_num_loop_insns (loop, &eni_time_weights);
- ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
- unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
+ /* FIXME: the time should be weighted by the probabilities of the blocks in
+ the loop body. */
+ time = tree_num_loop_insns (loop, &eni_time_weights);
+ ahead = (PREFETCH_LATENCY + time - 1) / time;
+ est_niter = estimated_loop_iterations_int (loop, false);
- /* If the loop rolls less than the required unroll factor, prefetching
- is useless. */
- if (unroll_factor > 1
- && cst_and_fits_in_hwi (desc.niter)
- && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
+ /* The prefetches will run for AHEAD iterations of the original loop. Unless
+ the loop rolls at least AHEAD times, prefetching the references does not
+ make sense. */
+ if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
goto fail;
+ ninsns = tree_num_loop_insns (loop, &eni_size_weights);
+ unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
+ est_niter);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
+
/* Step 4: what to prefetch? */
if (!schedule_prefetches (refs, unroll_factor, ahead))
goto fail;
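
Putting the two new guards together, here is a minimal standalone sketch of
the resulting control flow. The constants stand in for PREFETCH_LATENCY and
PARAM_MAX_UNROLLED_INSNS, which are target- and param-dependent, and
bounded_unroll_factor is a hypothetical reduction of determine_unroll_factor:

    #include <stdio.h>

    /* Hypothetical stand-ins for the target/param values.  */
    #define PREFETCH_LATENCY    100
    #define MAX_UNROLLED_INSNS  200

    /* Reduced model of determine_unroll_factor: the unroll factor is
       bounded both by the insn budget and by the estimated number of
       iterations (-1 means no estimate is available).  */
    static unsigned
    bounded_unroll_factor (unsigned ninsns, long est_niter)
    {
      unsigned upper_bound = MAX_UNROLLED_INSNS / ninsns;

      if (est_niter >= 0 && est_niter < (long) upper_bound)
        upper_bound = (unsigned) est_niter;

      return upper_bound <= 1 ? 1 : upper_bound;
    }

    int
    main (void)
    {
      unsigned time = 5;     /* weighted insns per iteration (illustrative) */
      unsigned ninsns = 10;  /* size insns per iteration (illustrative) */
      long est_niter = 20;   /* e.g. the bound from xxx[20] in the test */
      unsigned ahead = (PREFETCH_LATENCY + time - 1) / time;  /* = 20 */

      /* Guard 1: prefetches only pay off after AHEAD iterations.  */
      if (est_niter >= 0 && est_niter <= (long) ahead)
        {
          puts ("loop rolls at most AHEAD times: no prefetching");
          return 0;
        }

      /* Guard 2: never unroll past the estimated iteration count.  */
      printf ("unroll factor at most %u\n",
              bounded_unroll_factor (ninsns, est_niter));
      return 0;
    }
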