Diffstat (limited to 'libgomp/loop.c')
-rw-r--r--	libgomp/loop.c	341
1 file changed, 312 insertions(+), 29 deletions(-)
diff --git a/libgomp/loop.c b/libgomp/loop.c
index a8c7e24..4e0683b 100644
--- a/libgomp/loop.c
+++ b/libgomp/loop.c
@@ -27,9 +27,13 @@
 
 #include <limits.h>
 #include <stdlib.h>
+#include <string.h>
 #include "libgomp.h"
 
+ialias (GOMP_loop_runtime_next)
+ialias_redirect (GOMP_taskgroup_reduction_register)
+
 /* Initialize the given work share construct from the given arguments.  */
 
 static inline void
@@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share *ws, long start, long end, long incr,
 }
 
 /* The *_start routines are called when first encountering a loop construct
-   that is not bound directly to a parallel construct.  The first thread 
+   that is not bound directly to a parallel construct.  The first thread
    that arrives will create the work-share construct; subsequent threads
    will see the construct exists and allocate work from it.
 
    START, END, INCR are the bounds of the loop; due to the restrictions of
-   OpenMP, these values must be the same in every thread.  This is not 
+   OpenMP, these values must be the same in every thread.  This is not
    verified (nor is it entirely verifiable, since START is not necessarily
    retained intact in the work-share data structure).  CHUNK_SIZE is the
    scheduling parameter; again this must be identical in all threads.
@@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long end, long incr, long chunk_size,
   struct gomp_thread *thr = gomp_thread ();
 
   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_STATIC, chunk_size);
@@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_DYNAMIC, chunk_size);
@@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long end, long incr, long chunk_size,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_GUIDED, chunk_size);
@@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, long end, long incr,
                          long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_static_start (start, end, incr,
@@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, long end, long incr,
     }
 }
 
+static long
+gomp_adjust_sched (long sched, long *chunk_size)
+{
+  sched &= ~GFS_MONOTONIC;
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_DYNAMIC:
+    case GFS_GUIDED:
+      return sched;
+    /* GFS_RUNTIME is used for runtime schedule without monotonic
+       or nonmonotonic modifiers on the clause.
+       GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
+       modifier.  */
+    case GFS_RUNTIME:
+    /* GFS_AUTO is used for runtime schedule with nonmonotonic
+       modifier.  */
+    case GFS_AUTO:
+      {
+        struct gomp_task_icv *icv = gomp_icv (false);
+        sched = icv->run_sched_var & ~GFS_MONOTONIC;
+        switch (sched)
+          {
+          case GFS_STATIC:
+          case GFS_DYNAMIC:
+          case GFS_GUIDED:
+            *chunk_size = icv->run_sched_chunk_size;
+            break;
+          case GFS_AUTO:
+            sched = GFS_STATIC;
+            *chunk_size = 0;
+            break;
+          default:
+            abort ();
+          }
+        return sched;
+      }
+    default:
+      abort ();
+    }
+}
+
+bool
+GOMP_loop_start (long start, long end, long incr, long sched,
+                 long chunk_size, long *istart, long *iend,
+                 uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, start, end, incr,
+                      sched, chunk_size);
+      if (reductions)
+        {
+          GOMP_taskgroup_reduction_register (reductions);
+          thr->task->taskgroup->workshare = true;
+          thr->ts.work_share->task_reductions = reductions;
+        }
+      if (mem)
+        {
+          uintptr_t size = (uintptr_t) *mem;
+          if (size > (sizeof (struct gomp_work_share)
+                      - offsetof (struct gomp_work_share,
+                                  inline_ordered_team_ids)))
+            thr->ts.work_share->ordered_team_ids
+              = gomp_malloc_cleared (size);
+          else
+            memset (thr->ts.work_share->ordered_team_ids, '\0', size);
+          *mem = (void *) thr->ts.work_share->ordered_team_ids;
+        }
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+        {
+          uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+          gomp_workshare_task_reduction_register (reductions,
+                                                  first_reductions);
+        }
+      if (mem)
+        *mem = (void *) thr->ts.work_share->ordered_team_ids;
+    }
+
+  if (!istart)
+    return true;
+  return ialias_call (GOMP_loop_runtime_next) (istart, iend);
+}
+
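The new GOMP_loop_start entry point funnels every schedule through gomp_adjust_sched above: the compiler encodes the monotonic modifier as the GFS_MONOTONIC flag bit OR'ed onto the schedule kind, plain schedule(runtime) arrives as GFS_RUNTIME, and schedule(nonmonotonic: runtime) reuses GFS_AUTO. A minimal standalone sketch of that bit-masking convention; the enum mirrors gomp_schedule_type from libgomp.h after this patch, reproduced here for illustration rather than as the authoritative definition:

/* Sketch: decode a schedule argument the way gomp_adjust_sched does.
   GFS_MONOTONIC is a high flag bit, not a schedule kind of its own.  */
#include <stdio.h>

enum gomp_schedule_type
{
  GFS_RUNTIME,
  GFS_STATIC,
  GFS_DYNAMIC,
  GFS_GUIDED,
  GFS_AUTO,
  GFS_MONOTONIC = 0x80000000U
};

int
main (void)
{
  /* schedule (monotonic: dynamic, 16) would be passed like this.  */
  long sched = GFS_DYNAMIC | GFS_MONOTONIC;
  long kind = sched & ~GFS_MONOTONIC;   /* strip the modifier bit */
  printf ("kind=%ld monotonic=%d\n", kind, (sched & GFS_MONOTONIC) != 0);
  return 0;
}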
 /* The *_ordered_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED section.  */
 
@@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long start, long end, long incr,
   struct gomp_thread *thr = gomp_thread ();
 
   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_STATIC, chunk_size);
@@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long start, long end, long incr,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_DYNAMIC, chunk_size);
@@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long start, long end, long incr,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
                       GFS_GUIDED, chunk_size);
@@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long start, long end, long incr,
                                  long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_ordered_static_start (start, end, incr,
@@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long start, long end, long incr,
     }
 }
 
+bool
+GOMP_loop_ordered_start (long start, long end, long incr, long sched,
+                         long chunk_size, long *istart, long *iend,
+                         uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ordered = 1;
+  bool ret;
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (mem)
+    ordered += (uintptr_t) *mem;
+  if (gomp_work_share_start (ordered))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, start, end, incr,
+                      sched, chunk_size);
+      if (reductions)
+        {
+          GOMP_taskgroup_reduction_register (reductions);
+          thr->task->taskgroup->workshare = true;
+          thr->ts.work_share->task_reductions = reductions;
+        }
+      if (sched == GFS_STATIC)
+        gomp_ordered_static_init ();
+      else
+        gomp_mutex_lock (&thr->ts.work_share->lock);
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+        {
+          uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+          gomp_workshare_task_reduction_register (reductions,
+                                                  first_reductions);
+        }
+      sched = thr->ts.work_share->sched;
+      if (sched != GFS_STATIC)
+        gomp_mutex_lock (&thr->ts.work_share->lock);
+    }
+
+  if (mem)
+    {
+      uintptr_t p
+        = (uintptr_t) (thr->ts.work_share->ordered_team_ids
+                       + (thr->ts.team ? thr->ts.team->nthreads : 1));
+      p += __alignof__ (long long) - 1;
+      p &= ~(__alignof__ (long long) - 1);
+      *mem = (void *) p;
+    }
+
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_AUTO:
+      return !gomp_iter_static_next (istart, iend);
+    case GFS_DYNAMIC:
+      ret = gomp_iter_dynamic_next_locked (istart, iend);
+      break;
+    case GFS_GUIDED:
+      ret = gomp_iter_guided_next_locked (istart, iend);
+      break;
+    default:
+      abort ();
+    }
+
+  if (ret)
+    gomp_ordered_first ();
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+  return ret;
+}
+
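In GOMP_loop_ordered_start above, the block handed back through *MEM is carved out just past the ordered_team_ids array and rounded up to the alignment of long long. The rounding is the usual power-of-two idiom; a self-contained sketch (align_up is a hypothetical helper name, and the asserts assume an alignment of 8 as on typical 64-bit targets):

/* Sketch of the round-up used for *MEM: for a power-of-two alignment A,
   (p + A - 1) & ~(A - 1) advances p to the next multiple of A, leaving
   it unchanged if already aligned.  */
#include <assert.h>
#include <stdint.h>

static uintptr_t
align_up (uintptr_t p, uintptr_t a)
{
  return (p + a - 1) & ~(a - 1);
}

int
main (void)
{
  assert (align_up (13, 8) == 16);
  assert (align_up (16, 8) == 16);   /* already aligned: unchanged */
  return 0;
}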
 /* The *_doacross_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
    section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
 
@@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigned ncounts, long *counts,
   struct gomp_thread *thr = gomp_thread ();
 
   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
                       GFS_STATIC, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }
 
@@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
                       GFS_DYNAMIC, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }
 
@@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigned ncounts, long *counts,
   struct gomp_thread *thr = gomp_thread ();
   bool ret;
 
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
                       GFS_GUIDED, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }
 
@@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
                                   long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_doacross_static_start (ncounts, counts,
@@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
     }
 }
 
-/* The *_next routines are called when the thread completes processing of
-   the iteration block currently assigned to it.  If the work-share
+bool
+GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched,
+                          long chunk_size, long *istart, long *iend,
+                          uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      size_t extra = 0;
+      if (mem)
+        extra = (uintptr_t) *mem;
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+                      sched, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, extra);
+      if (reductions)
+        {
+          GOMP_taskgroup_reduction_register (reductions);
+          thr->task->taskgroup->workshare = true;
+          thr->ts.work_share->task_reductions = reductions;
+        }
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+        {
+          uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+          gomp_workshare_task_reduction_register (reductions,
+                                                  first_reductions);
+        }
+      sched = thr->ts.work_share->sched;
+    }
+
+  if (mem)
+    *mem = thr->ts.work_share->doacross->extra;
+
+  return ialias_call (GOMP_loop_runtime_next) (istart, iend);
+}
+
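GOMP_loop_doacross_start above overloads its MEM argument in both directions: on entry *MEM carries the number of extra scratch bytes requested (a size cast to uintptr_t), and on exit it holds the pointer to the space gomp_doacross_init reserved in ws->doacross->extra. A sketch of that in/out convention, with reserve_extra as a hypothetical stand-in for the work-share machinery:

/* Sketch: a byte count is smuggled in through a void **, and a pointer
   to the allocated scratch space is handed back out the same way.  */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *scratch;   /* stand-in for ws->doacross->extra */

static void
reserve_extra (void **mem)
{
  size_t extra = (uintptr_t) *mem;   /* size in */
  scratch = calloc (1, extra ? extra : 1);
  *mem = scratch;                    /* pointer out */
}

int
main (void)
{
  void *mem = (void *) (uintptr_t) 64;   /* request 64 bytes */
  reserve_extra (&mem);
  printf ("scratch at %p\n", mem);
  free (scratch);
  return 0;
}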
+/* The *_next routines are called when the thread completes processing of
+   the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
    bounds may have been set up before the parallel.  In which case, this
    may be the first iteration for the thread.
 
@@ -456,7 +673,7 @@ bool
 GOMP_loop_runtime_next (long *istart, long *iend)
 {
   struct gomp_thread *thr = gomp_thread ();
-  
+
   switch (thr->ts.work_share->sched)
     {
     case GFS_STATIC:
@@ -534,7 +751,7 @@ bool
 GOMP_loop_ordered_runtime_next (long *istart, long *iend)
 {
   struct gomp_thread *thr = gomp_thread ();
-  
+
   switch (thr->ts.work_share->sched)
     {
     case GFS_STATIC:
@@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (void *), void *data,
   num_threads = gomp_resolve_num_threads (num_threads, 0);
   team = gomp_new_team (num_threads);
   gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
-  gomp_team_start (fn, data, num_threads, flags, team);
+  gomp_team_start (fn, data, num_threads, flags, team, NULL);
 }
 
 void
@@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void (*fn) (void *), void *data,
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-                            icv->run_sched_var, icv->run_sched_chunk_size, 0);
+                            icv->run_sched_var & ~GFS_MONOTONIC,
+                            icv->run_sched_chunk_size, 0);
 }
 
 ialias_redirect (GOMP_parallel_end)
@@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (void *), void *data,
   GOMP_parallel_end ();
 }
 
+void
+GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
+                            unsigned num_threads, long start, long end,
+                            long incr, unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+                            icv->run_sched_var & ~GFS_MONOTONIC,
+                            icv->run_sched_chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
 #ifdef HAVE_ATTRIBUTE_ALIAS
 extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic
 	__attribute__((alias ("GOMP_parallel_loop_dynamic")));
 extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided
 	__attribute__((alias ("GOMP_parallel_loop_guided")));
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime
+	__attribute__((alias ("GOMP_parallel_loop_runtime")));
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime
+	__attribute__((alias ("GOMP_parallel_loop_runtime")));
 #else
 void
 GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data,
@@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided (void (*fn) (void *), void *data,
   fn (data);
   GOMP_parallel_end ();
 }
-#endif
 
 void
-GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
-                            unsigned num_threads, long start, long end,
-                            long incr, unsigned flags)
+GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data,
+                                         unsigned num_threads, long start,
+                                         long end, long incr, unsigned flags)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-                            icv->run_sched_var, icv->run_sched_chunk_size,
-                            flags);
+                            icv->run_sched_var & ~GFS_MONOTONIC,
+                            icv->run_sched_chunk_size, flags);
   fn (data);
   GOMP_parallel_end ();
 }
 
+void
+GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data,
+                                               unsigned num_threads, long start,
+                                               long end, long incr,
+                                               unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+                            icv->run_sched_var & ~GFS_MONOTONIC,
+                            icv->run_sched_chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+#endif
+
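The #ifdef HAVE_ATTRIBUTE_ALIAS split above is the file's standard pattern for the new nonmonotonic entry points: where the toolchain supports it, each extra exported name is bound to an existing definition with GCC's alias attribute, at zero call overhead; otherwise a plain forwarding wrapper is compiled. A condensed sketch of the pattern (impl and exported_name are illustrative names, not libgomp symbols):

/* Sketch: exported_name becomes a second symbol for impl when attribute
   alias is available, else a thin wrapper with the same behavior.  */
#include <stdio.h>

static int
impl (int x)
{
  return x + 1;
}

#ifdef HAVE_ATTRIBUTE_ALIAS
extern __typeof (impl) exported_name __attribute__ ((alias ("impl")));
#else
int
exported_name (int x)
{
  return impl (x);
}
#endif

int
main (void)
{
  printf ("%d\n", exported_name (41));   /* prints 42 either way */
  return 0;
}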
 /* The GOMP_loop_end* routines are called after the thread is told that
    all loop iterations are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
 
@@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start) GOMP_loop_nonmonotonic_dynamic_start
 	__attribute__((alias ("gomp_loop_dynamic_start")));
 extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start
 	__attribute__((alias ("gomp_loop_guided_start")));
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_runtime_start")));
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_runtime_start")));
 
 extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start
 	__attribute__((alias ("gomp_loop_ordered_static_start")));
@@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next) GOMP_loop_nonmonotonic_dynamic_next
 	__attribute__((alias ("gomp_loop_dynamic_next")));
 extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next
 	__attribute__((alias ("gomp_loop_guided_next")));
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_runtime_next")));
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_runtime_next")));
 
 extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next
 	__attribute__((alias ("gomp_loop_ordered_static_next")));
@@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (long start, long end, long incr,
 }
 
 bool
+GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr,
+                                      long *istart, long *iend)
+{
+  return GOMP_loop_runtime_start (start, end, incr, istart, iend);
+}
+
+bool
+GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr,
+                                            long *istart, long *iend)
+{
+  return GOMP_loop_runtime_start (start, end, incr, istart, iend);
+}
+
+bool
 GOMP_loop_ordered_static_start (long start, long end, long incr,
                                 long chunk_size, long *istart, long *iend)
 {
@@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long *istart, long *iend)
 }
 
 bool
+GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend)
+{
+  return GOMP_loop_runtime_next (istart, iend);
+}
+
+bool
+GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend)
+{
+  return GOMP_loop_runtime_next (istart, iend);
+}
+
+bool
 GOMP_loop_ordered_static_next (long *istart, long *iend)
 {
   return gomp_loop_ordered_static_next (istart, iend);
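For reference, the protocol these wrappers plug into: the compiler expands a worksharing loop into one *_start call, a drain loop over *_next, and a GOMP_loop_end* call, with each block half-open in [*ISTART, *IEND). A sketch of roughly what generated code for schedule(dynamic, 4) looks like; illustrative only, since real programs should use the pragma and let the compiler emit these calls (link with -lgomp):

/* Sketch of the start/next/end calling convention of these entry points.  */
#include <stdbool.h>
#include <stdio.h>

extern bool GOMP_loop_dynamic_start (long, long, long, long, long *, long *);
extern bool GOMP_loop_dynamic_next (long *, long *);
extern void GOMP_loop_end_nowait (void);

int
main (void)
{
  long istart, iend;
  /* Iterations 0..99, step 1, dynamic chunks of 4.  */
  if (GOMP_loop_dynamic_start (0, 100, 1, 4, &istart, &iend))
    do
      {
        for (long i = istart; i < iend; i++)
          printf ("iter %ld\n", i);
      }
    while (GOMP_loop_dynamic_next (&istart, &iend));
  GOMP_loop_end_nowait ();
  return 0;
}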