Linux: don't resume new LWPs until we've pulled all events out of the kernel

Since the starvation avoidance series (https://sourceware.org/ml/gdb-patches/2014-12/msg00631.html), both GDB and GDBserver pull all events out of ptrace before deciding which event to process. There's one problem with that though. Because we resume new threads immediately when we see a PTRACE_EVENT_CLONE event, if the program constantly spawns threads fast enough, new threads can spawn threads faster we can pull events out of the kernel, and thus we'd get stuck in an infinite loop, never returning any event to the core to process. I occasionally see this happen with the attach-many-short-lived-threads.exp test against gdbserver. The fix is to delay resuming new threads until we've pulled out all events out of the kernel. On native, we already have the resume_stopped_resumed_lwps function that knows to resume LWPs that are stopped with no event to report to the core. So the patch just adds another use. GDBserver didn't have the equivalent yet, so the patch adds one. Tested on x86_64 Fedora 20, native and gdbserver (remote and extended-remote). gdb/gdbserver/ChangeLog: 2015-02-04 Pedro Alves <palves@redhat.com> * linux-low.c (handle_extended_wait): Don't resume LWPs here. (resume_stopped_resumed_lwps): New function. (linux_wait_for_event_filtered): Use it. gdb/ChangeLog: 2015-02-04 Pedro Alves <palves@redhat.com> * linux-nat.c (handle_extended_wait): Don't resume LWPs here. (wait_lwp): Don't call wait_lwp if linux_handle_extended_wait returns true. (resume_stopped_resumed_lwps): Don't check whether the thread is marked as executing. (linux_nat_wait_1): Use resume_stopped_resumed_lwps.
author: Pedro Alves <palves@redhat.com> 2015-02-04 19:13:28 +0100
committer: Pedro Alves <palves@redhat.com> 2015-02-04 19:13:28 +0100
commit: 20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0 (patch)
tree: 2f33431e75790cfddfac5f6ef30f175f826f9af1
parent: 3c537f7fdb11f02f7082749f3f21dfdd2c2025e8 (diff)
download: fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.zip
fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.tar.gz
fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.tar.bz2
4 files changed, 63 insertions, 53 deletions
diff --git a/gdb/ChangeLog b/gdb/ChangeLog
index 6250534..aae6a7c 100644
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,3 +1,12 @@
+2015-02-04  Pedro Alves  <palves@redhat.com>
+
+	* linux-nat.c (handle_extended_wait): Don't resume LWPs here.
+	(wait_lwp): Don't call wait_lwp if linux_handle_extended_wait
+	returns true.
+	(resume_stopped_resumed_lwps): Don't check whether the thread is
+	marked as executing.
+	(linux_nat_wait_1): Use resume_stopped_resumed_lwps.
+
 2015-02-04  Andreas Arnez  <arnez@linux.vnet.ibm.com>
 
 	* regset.h (struct regset): Add flags field.
diff --git a/gdb/gdbserver/ChangeLog b/gdb/gdbserver/ChangeLog
index 51f1019..f55add2 100644
--- a/gdb/gdbserver/ChangeLog
+++ b/gdb/gdbserver/ChangeLog
@@ -1,3 +1,9 @@
+2015-02-04  Pedro Alves  <palves@redhat.com>
+
+	* linux-low.c (handle_extended_wait): Don't resume LWPs here.
+	(resume_stopped_resumed_lwps): New function.
+	(linux_wait_for_event_filtered): Use it.
+
 2015-01-15  Sergio Durigan Junior  <sergiodj@redhat.com>
 
 	* Makefile.in (SFILES): Add linux-personality.c.
diff --git a/gdb/gdbserver/linux-low.c b/gdb/gdbserver/linux-low.c
index 5e37dd5..1682679 100644
--- a/gdb/gdbserver/linux-low.c
+++ b/gdb/gdbserver/linux-low.c
@@ -427,30 +427,12 @@ handle_extended_wait (struct lwp_info *event_child, int wstat)
       /* Normally we will get the pending SIGSTOP.  But in some cases
 	 we might get another signal delivered to the group first.
 	 If we do get another signal, be sure not to lose it.  */
-      if (WSTOPSIG (status) == SIGSTOP)
-	{
-	  if (stopping_threads == NOT_STOPPING_THREADS)
-	    linux_resume_one_lwp (new_lwp, 0, 0, NULL);
-	}
-      else
+      if (WSTOPSIG (status) != SIGSTOP)
 	{
 	  new_lwp->stop_expected = 1;
-
-	  if (stopping_threads != NOT_STOPPING_THREADS)
-	    {
-	      new_lwp->status_pending_p = 1;
-	      new_lwp->status_pending = status;
-	    }
-	  else
-	    /* Pass the signal on.  This is what GDB does - except
-	       shouldn't we really report it instead?  */
-	    linux_resume_one_lwp (new_lwp, 0, WSTOPSIG (status), NULL);
+	  new_lwp->status_pending_p = 1;
+	  new_lwp->status_pending = status;
 	}
-
-      /* Always resume the current thread.  If we are stopping
-	 threads, it will have a pending SIGSTOP; we may as well
-	 collect it now.  */
-      linux_resume_one_lwp (event_child, event_child->stepping, 0, NULL);
     }
 }
 
@@ -1955,6 +1937,32 @@ linux_low_filter_event (int lwpid, int wstat)
   return child;
 }
 
+/* Resume LWPs that are currently stopped without any pending status
+   to report, but are resumed from the core's perspective.  */
+
+static void
+resume_stopped_resumed_lwps (struct inferior_list_entry *entry)
+{
+  struct thread_info *thread = (struct thread_info *) entry;
+  struct lwp_info *lp = get_thread_lwp (thread);
+
+  if (lp->stopped
+      && !lp->status_pending_p
+      && thread->last_resume_kind != resume_stop
+      && thread->last_status.kind == TARGET_WAITKIND_IGNORE)
+    {
+      int step = thread->last_resume_kind == resume_step;
+
+      if (debug_threads)
+	debug_printf ("RSRL: resuming stopped-resumed LWP %s at %s: step=%d\n",
+		      target_pid_to_str (ptid_of (thread)),
+		      paddress (lp->stop_pc),
+		      step);
+
+      linux_resume_one_lwp (lp, step, GDB_SIGNAL_0, NULL);
+    }
+}
+
 /* Wait for an event from child(ren) WAIT_PTID, and return any that
    match FILTER_PTID (leaving others pending).  The PTIDs can be:
    minus_one_ptid, to specify any child; a pid PTID, specifying all
@@ -2088,8 +2096,13 @@ linux_wait_for_event_filtered (ptid_t wait_ptid, ptid_t filter_ptid,
 	  continue;
 	}
 
-      /* Now that we've pulled all events out of the kernel, check if
-	 there's any LWP with a status to report to the core.  */
+      /* Now that we've pulled all events out of the kernel, resume
+	 LWPs that don't have an interesting event to report.  */
+      if (stopping_threads == NOT_STOPPING_THREADS)
+	for_each_inferior (&all_threads, resume_stopped_resumed_lwps);
+
+      /* ... and find an LWP with a status to report to the core, if
+	 any.  */
       event_thread = (struct thread_info *)
 	find_inferior (&all_threads, status_pending_p_callback, &filter_ptid);
       if (event_thread != NULL)
diff --git a/gdb/linux-nat.c b/gdb/linux-nat.c
index b4893d44..169188a 100644
--- a/gdb/linux-nat.c
+++ b/gdb/linux-nat.c
@@ -691,6 +691,7 @@ linux_nat_pass_signals (struct target_ops *self,
 static int stop_wait_callback (struct lwp_info *lp, void *data);
 static int linux_thread_alive (ptid_t ptid);
 static char *linux_child_pid_to_exec_file (struct target_ops *self, int pid);
+static int resume_stopped_resumed_lwps (struct lwp_info *lp, void *data);
 
 
 
@@ -2033,28 +2034,7 @@ linux_handle_extended_wait (struct lwp_info *lp, int status,
 	      new_lp->status = status;
 	    }
 
-	  /* Note the need to use the low target ops to resume, to
-	     handle resuming with PT_SYSCALL if we have syscall
-	     catchpoints.  */
-	  if (!stopping)
-	    {
-	      new_lp->resumed = 1;
-
-	      if (status == 0)
-		{
-		  gdb_assert (new_lp->last_resume_kind == resume_continue);
-		  if (debug_linux_nat)
-		    fprintf_unfiltered (gdb_stdlog,
-					"LHEW: resuming new LWP %ld\n",
-					ptid_get_lwp (new_lp->ptid));
-		  linux_resume_one_lwp (new_lp, 0, GDB_SIGNAL_0);
-		}
-	    }
-
-	  if (debug_linux_nat)
-	    fprintf_unfiltered (gdb_stdlog,
-				"LHEW: resuming parent LWP %d\n", pid);
-	  linux_resume_one_lwp (lp, 0, GDB_SIGNAL_0);
+	  new_lp->resumed = !stopping;
 	  return 1;
 	}
 
@@ -2096,9 +2076,8 @@ linux_handle_extended_wait (struct lwp_info *lp, int status,
       if (debug_linux_nat)
 	fprintf_unfiltered (gdb_stdlog,
 			    "LHEW: Got PTRACE_EVENT_VFORK_DONE "
-			    "from LWP %ld: resuming\n",
+			    "from LWP %ld: ignoring\n",
 			    ptid_get_lwp (lp->ptid));
-      ptrace (PTRACE_CONT, ptid_get_lwp (lp->ptid), 0, 0);
       return 1;
     }
 
@@ -2245,8 +2224,8 @@ wait_lwp (struct lwp_info *lp)
 	fprintf_unfiltered (gdb_stdlog,
 			    "WL: Handling extended status 0x%06x\n",
 			    status);
-      if (linux_handle_extended_wait (lp, status, 1))
-	return wait_lwp (lp);
+      linux_handle_extended_wait (lp, status, 1);
+      return 0;
     }
 
   return status;
@@ -3274,8 +3253,13 @@ linux_nat_wait_1 (struct target_ops *ops,
 	  continue;
 	}
 
-      /* Now that we've pulled all events out of the kernel, check if
-	 there's any LWP with a status to report to the core.  */
+      /* Now that we've pulled all events out of the kernel, resume
+	 LWPs that don't have an interesting event to report.  */
+      iterate_over_lwps (minus_one_ptid,
+			 resume_stopped_resumed_lwps, &minus_one_ptid);
+
+      /* ... and find an LWP with a status to report to the core, if
+	 any.  */
       lp = iterate_over_lwps (ptid, status_callback, NULL);
       if (lp != NULL)
 	break;
@@ -3436,8 +3420,6 @@ resume_stopped_resumed_lwps (struct lwp_info *lp, void *data)
       struct gdbarch *gdbarch = get_regcache_arch (regcache);
       CORE_ADDR pc = regcache_read_pc (regcache);
 
-      gdb_assert (is_executing (lp->ptid));
-
       /* Don't bother if there's a breakpoint at PC that we'd hit
 	 immediately, and we're not waiting for this LWP.  */
       if (!ptid_match (lp->ptid, *wait_ptid_p))
author	Pedro Alves <palves@redhat.com>	2015-02-04 19:13:28 +0100
committer	Pedro Alves <palves@redhat.com>	2015-02-04 19:13:28 +0100
commit	20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0 (patch)
tree	2f33431e75790cfddfac5f6ef30f175f826f9af1
parent	3c537f7fdb11f02f7082749f3f21dfdd2c2025e8 (diff)
download	fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.zip fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.tar.gz fsf-binutils-gdb-20ba1ce66d31b9dd16ed8c648f46ce32aa3a03e0.tar.bz2