New ThreadPlanSingleThreadTimeout to resolve potential deadlock in single thread stepping (#90930)

This PR introduces a new `ThreadPlanSingleThreadTimeout` that will be used to address potential deadlock during single-thread stepping.

While debugging a target with a non-trivial number of threads (around 5000 threads in one example target), we noticed that a simple step over can take as long as 10 seconds. Enabling single-thread stepping mode significantly reduces the stepping time to around 3 seconds. However, this can introduce a deadlock if we try to step over a method that depends on other threads to release a lock.

To address this issue, we introduce a new `ThreadPlanSingleThreadTimeout` that can be controlled by the `target.process.thread.single-thread-plan-timeout` setting during single-thread stepping mode. The concept involves counting the elapsed time since the last internal stop to detect overall stepping progress. Once a timeout occurs, we assume the target is not making progress due to a potential deadlock, as mentioned above. We then send a new async interrupt, resume all threads, and `ThreadPlanSingleThreadTimeout` completes its task.

To support this design, the major changes made in this PR are:

1. `ThreadPlanSingleThreadTimeout` is popped during every internal stop and reset (re-pushed) to the top of the stack (as a leaf node) during resume. This is achieved by always returning `true` from `ThreadPlanSingleThreadTimeout::DoPlanExplainsStop()` and `ThreadPlanSingleThreadTimeout::MischiefManaged()`.
2. A new thread-specific async interrupt stop is introduced, which can be detected/consumed by `ThreadPlanSingleThreadTimeout`.
3. The clearing of branch breakpoints in the range thread plan has been moved from `DoPlanExplainsStop()` to `ShouldStop()`, because `DoPlanExplainsStop()` is not guaranteed to be called.

The detailed design is discussed in the RFC below: [https://discourse.llvm.org/t/improve-single-thread-stepping/74599](https://discourse.llvm.org/t/improve-single-thread-stepping/74599)

---------

Co-authored-by: jeffreytan81 <jeffreytan@fb.com>
author: jeffreytan81 <jeffreytan@meta.com> 2024-08-05 17:26:39 -0700
committer: GitHub <noreply@github.com> 2024-08-05 17:26:39 -0700
commit: f838fa820f9271008617c345c477122d9e29a05c (patch)
tree: 17e83bb61d8354b9502fa2e0935d7a9360bf7892 /lldb/source/Target/ThreadPlanStepRange.cpp
parent: 84cc1865ef9202af39404ff4524a9b13df80cfc1 (diff)
download: llvm-f838fa820f9271008617c345c477122d9e29a05c.zip
llvm-f838fa820f9271008617c345c477122d9e29a05c.tar.gz
llvm-f838fa820f9271008617c345c477122d9e29a05c.tar.bz2
1 files changed, 45 insertions, 26 deletions
diff --git a/lldb/source/Target/ThreadPlanStepRange.cpp b/lldb/source/Target/ThreadPlanStepRange.cpp
index 801856b..3c82505 100644
--- a/lldb/source/Target/ThreadPlanStepRange.cpp
+++ b/lldb/source/Target/ThreadPlanStepRange.cpp
@@ -293,6 +293,20 @@ InstructionList *ThreadPlanStepRange::GetInstructionsForAddress(
   return nullptr;
 }
 
+bool ThreadPlanStepRange::IsNextBranchBreakpointStop(StopInfoSP stop_info_sp) {
+  if (!m_next_branch_bp_sp)
+    return false;
+
+  break_id_t bp_site_id = stop_info_sp->GetValue();
+  BreakpointSiteSP bp_site_sp =
+      m_process.GetBreakpointSiteList().FindByID(bp_site_id);
+  if (!bp_site_sp)
+    return false;
+  else if (!bp_site_sp->IsBreakpointAtThisSite(m_next_branch_bp_sp->GetID()))
+    return false;
+  return true;
+}
+
 void ThreadPlanStepRange::ClearNextBranchBreakpoint() {
   if (m_next_branch_bp_sp) {
     Log *log = GetLog(LLDBLog::Step);
@@ -305,6 +319,11 @@ void ThreadPlanStepRange::ClearNextBranchBreakpoint() {
   }
 }
 
+void ThreadPlanStepRange::ClearNextBranchBreakpointExplainedStop() {
+  if (IsNextBranchBreakpointStop(GetPrivateStopInfo()))
+    ClearNextBranchBreakpoint();
+}
+
 bool ThreadPlanStepRange::SetNextBranchBreakpoint() {
   if (m_next_branch_bp_sp)
     return true;
@@ -347,7 +366,9 @@ bool ThreadPlanStepRange::SetNextBranchBreakpoint() {
       run_to_address =
           instructions->GetInstructionAtIndex(branch_index)->GetAddress();
     }
-
+    if (branch_index == pc_index)
+      LLDB_LOGF(log, "ThreadPlanStepRange::SetNextBranchBreakpoint - skipping "
+                     "because current is branch instruction");
     if (run_to_address.IsValid()) {
       const bool is_internal = true;
       m_next_branch_bp_sp =
@@ -381,15 +402,16 @@ bool ThreadPlanStepRange::SetNextBranchBreakpoint() {
         return true;
       } else
         return false;
-    }
+    } else
+      LLDB_LOGF(log, "ThreadPlanStepRange::SetNextBranchBreakpoint - skipping "
+                     "invalid run_to_address");
   }
   return false;
 }
 
 bool ThreadPlanStepRange::NextRangeBreakpointExplainsStop(
     lldb::StopInfoSP stop_info_sp) {
-  Log *log = GetLog(LLDBLog::Step);
-  if (!m_next_branch_bp_sp)
+  if (!IsNextBranchBreakpointStop(stop_info_sp))
     return false;
 
   break_id_t bp_site_id = stop_info_sp->GetValue();
@@ -397,30 +419,27 @@ bool ThreadPlanStepRange::NextRangeBreakpointExplainsStop(
       m_process.GetBreakpointSiteList().FindByID(bp_site_id);
   if (!bp_site_sp)
     return false;
-  else if (!bp_site_sp->IsBreakpointAtThisSite(m_next_branch_bp_sp->GetID()))
-    return false;
-  else {
-    // If we've hit the next branch breakpoint, then clear it.
-    size_t num_constituents = bp_site_sp->GetNumberOfConstituents();
-    bool explains_stop = true;
-    // If all the constituents are internal, then we are probably just stepping
-    // over this range from multiple threads, or multiple frames, so we want to
-    // continue.  If one is not internal, then we should not explain the stop,
-    // and let the user breakpoint handle the stop.
-    for (size_t i = 0; i < num_constituents; i++) {
-      if (!bp_site_sp->GetConstituentAtIndex(i)->GetBreakpoint().IsInternal()) {
-        explains_stop = false;
-        break;
-      }
+
+  // If we've hit the next branch breakpoint, then clear it.
+  size_t num_constituents = bp_site_sp->GetNumberOfConstituents();
+  bool explains_stop = true;
+  // If all the constituents are internal, then we are probably just stepping
+  // over this range from multiple threads, or multiple frames, so we want to
+  // continue.  If one is not internal, then we should not explain the stop,
+  // and let the user breakpoint handle the stop.
+  for (size_t i = 0; i < num_constituents; i++) {
+    if (!bp_site_sp->GetConstituentAtIndex(i)->GetBreakpoint().IsInternal()) {
+      explains_stop = false;
+      break;
     }
-    LLDB_LOGF(log,
-              "ThreadPlanStepRange::NextRangeBreakpointExplainsStop - Hit "
-              "next range breakpoint which has %" PRIu64
-              " constituents - explains stop: %u.",
-              (uint64_t)num_constituents, explains_stop);
-    ClearNextBranchBreakpoint();
-    return explains_stop;
   }
+  Log *log = GetLog(LLDBLog::Step);
+  LLDB_LOGF(log,
+            "ThreadPlanStepRange::NextRangeBreakpointExplainsStop - Hit "
+            "next range breakpoint which has %" PRIu64
+            " constituents - explains stop: %u.",
+            (uint64_t)num_constituents, explains_stop);
+  return explains_stop;
 }
 
 bool ThreadPlanStepRange::WillStop() { return true; }
author	jeffreytan81 <jeffreytan@meta.com>	2024-08-05 17:26:39 -0700
committer	GitHub <noreply@github.com>	2024-08-05 17:26:39 -0700
commit	f838fa820f9271008617c345c477122d9e29a05c (patch)
tree	17e83bb61d8354b9502fa2e0935d7a9360bf7892 /lldb/source/Target/ThreadPlanStepRange.cpp
parent	84cc1865ef9202af39404ff4524a9b13df80cfc1 (diff)
download	llvm-f838fa820f9271008617c345c477122d9e29a05c.zip llvm-f838fa820f9271008617c345c477122d9e29a05c.tar.gz llvm-f838fa820f9271008617c345c477122d9e29a05c.tar.bz2