You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2018/02/21 23:55:29 UTC

[1/2] kudu git commit: KUDU-2291 (part 5): allow collecting stack traces asynchronously

Repository: kudu
Updated Branches:
  refs/heads/master 50931e291 -> 0ee40b661


KUDU-2291 (part 5): allow collecting stack traces asynchronously

This removes the global variable previously used to communicate between
a stack trace collector and the stack trace target. The global variable
had the downside of allowing only a single stack trace operation in
flight at a time, which means that for use cases where we want to
capture the trace of a bunch of threads, we needed to do so serially.

For use cases like process-wide traces this could make debugging a bit
tricky. For example, because of the skew in time between collecting
the stack of thread A and the stack of thread B, it's possible that
the traces could show them holding the same lock, which we know to be
impossible. Getting the collection time as close as possible to
"simultaneous" can reduce the possibility of these kind of
strange-looking results.

The actual mechanism to do this is slightly tricky. Rather than
duplicate the full description here, check the comments in the
implementation. In particular, StackTraceCollector::RevokeSigData()
explains the complexity of not knowing whether our signal will ever be
delivered.

A new unit test shows the pattern, and this also updates the stack
servlet to send the signals to all of the threads as close as possible
to "the same time".

Change-Id: Ie4d7789bb1272db033df4685f46eaf8483ec7be1
Reviewed-on: http://gerrit.cloudera.org:8080/9318
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c20b84bb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c20b84bb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c20b84bb

Branch: refs/heads/master
Commit: c20b84bb600575e4f6a34846c07acf4603b1bf26
Parents: 50931e2
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Feb 7 16:34:42 2018 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Feb 21 23:54:49 2018 +0000

----------------------------------------------------------------------
 src/kudu/server/default_path_handlers.cc |  48 +++--
 src/kudu/util/debug-util-test.cc         |  65 ++++++-
 src/kudu/util/debug-util.cc              | 268 +++++++++++++++++---------
 src/kudu/util/debug-util.h               |  45 +++++
 4 files changed, 317 insertions(+), 109 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/c20b84bb/src/kudu/server/default_path_handlers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/server/default_path_handlers.cc b/src/kudu/server/default_path_handlers.cc
index 3ecc0fe..b857be5 100644
--- a/src/kudu/server/default_path_handlers.cc
+++ b/src/kudu/server/default_path_handlers.cc
@@ -157,6 +157,7 @@ static void FlagsHandler(const Webserver::WebRequest& req,
 // Prints out the current stack trace of all threads in the process.
 static void StacksHandler(const Webserver::WebRequest& /*req*/,
                           Webserver::PrerenderedWebResponse* resp) {
+  MonoTime start = MonoTime::Now();
   std::ostringstream* output = resp->output;
   vector<pid_t> tids;
   Status s = ListThreads(&tids);
@@ -168,22 +169,19 @@ static void StacksHandler(const Webserver::WebRequest& /*req*/,
     pid_t tid;
     Status status;
     string thread_name;
+    StackTraceCollector stc;
     StackTrace stack;
   };
-  std::multimap<string, Info> grouped_infos;
-  vector<Info> failed;
-
-  // Capture all the stacks without symbolization initially so that
-  // the stack traces come from as close together in time as possible.
-  //
-  // TODO(todd): would be good to actually send the dump signal to all
-  // threads and then wait for them all to collect their traces, to get
-  // an even tighter snapshot.
-  MonoTime start = MonoTime::Now();
+
+  // Initially trigger all the stack traces.
+  vector<Info> infos(tids.size());
   for (int i = 0; i < tids.size(); i++) {
-    Info info;
-    info.tid = tids[i];
+    infos[i].tid = tids[i];
+    infos[i].status = infos[i].stc.TriggerAsync(tids[i], &infos[i].stack);
+  }
 
+  // Now collect the thread names while we are waiting on stack trace collection.
+  for (auto& info : infos) {
     // Get the thread's name by reading proc.
     // TODO(todd): should we have the dumped thread fill in its own name using
     // prctl to avoid having to open and read /proc? Or maybe we should use the
@@ -199,26 +197,38 @@ static void StacksHandler(const Webserver::WebRequest& /*req*/,
       info.thread_name = buf.ToString();
       StripTrailingNewline(&info.thread_name);
     }
+  }
+
+  // Now actually collect all the stacks.
+  MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(1);
+  for (auto& info : infos) {
+    info.status = info.status.AndThen([&] {
+        return info.stc.AwaitCollection(deadline);
+      });
+  }
 
-    info.status = GetThreadStack(info.tid, &info.stack);
+  // And group the threads by their stack trace.
+  std::multimap<string, Info*> grouped_infos;
+  int num_failed = 0;
+  for (auto& info : infos) {
     if (info.status.ok()) {
-      grouped_infos.emplace(info.stack.ToHexString(), std::move(info));
+      grouped_infos.emplace(info.stack.ToHexString(), &info);
     } else {
-      failed.emplace_back(std::move(info));
+      num_failed++;
     }
   }
   MonoDelta dur = MonoTime::Now() - start;
 
   *output << "Collected stacks from " << grouped_infos.size() << " threads in "
           << dur.ToString() << "\n";
-  if (!failed.empty()) {
-    *output << "Failed to collect stacks from " << failed.size() << " threads "
+  if (num_failed) {
+    *output << "Failed to collect stacks from " << num_failed << " threads "
             << "(they may have exited while we were iterating over the threads)\n";
   }
   *output << "\n";
   for (auto it = grouped_infos.begin(); it != grouped_infos.end();) {
     auto end_group = grouped_infos.equal_range(it->first).second;
-    const auto& stack = it->second.stack;
+    const auto& stack = it->second->stack;
     int num_in_group = std::distance(it, end_group);
     if (num_in_group > 1) {
       *output << num_in_group << " threads with same stack:\n";
@@ -226,7 +236,7 @@ static void StacksHandler(const Webserver::WebRequest& /*req*/,
 
     while (it != end_group) {
       const auto& info = it->second;
-      *output << "TID " << info.tid << "(" << info.thread_name << "):\n";
+      *output << "TID " << info->tid << "(" << info->thread_name << "):\n";
       ++it;
     }
     *output << stack.Symbolize() << "\n\n";

http://git-wip-us.apache.org/repos/asf/kudu/blob/c20b84bb/src/kudu/util/debug-util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc
index 88de5ce..abd7c4f 100644
--- a/src/kudu/util/debug-util-test.cc
+++ b/src/kudu/util/debug-util-test.cc
@@ -23,6 +23,7 @@
 
 #include <csignal>
 #include <cstddef>
+#include <memory>
 #include <ostream>
 #include <string>
 #include <vector>
@@ -149,10 +150,49 @@ TEST_F(DebugUtilTest, TestSignalStackTrace) {
 // We don't validate the results in any way -- but this verifies that we can
 // dump library threads such as the libc timer_thread and properly time out.
 TEST_F(DebugUtilTest, TestDumpAllThreads) {
+  // Start a bunch of sleeping threads.
+  const int kNumThreads = 30;
+  CountDownLatch l(1);
+  vector<scoped_refptr<Thread>> threads(kNumThreads);
+  for (int i = 0; i < kNumThreads; i++) {
+    ASSERT_OK(Thread::Create("test", "test thread", &SleeperThread, &l, &threads[i]));
+  }
+
+  SCOPED_CLEANUP({
+      // Allow the thread to finish.
+      l.CountDown();
+      for (auto& t : threads) {
+        t->Join();
+      }
+    });
+
+  // Trigger all of the stack traces.
   vector<pid_t> tids;
   ASSERT_OK(ListThreads(&tids));
-  for (pid_t tid : tids) {
-    LOG(INFO) << DumpThreadStack(tid);
+  vector<StackTraceCollector> collectors(tids.size());
+  vector<StackTrace> traces(tids.size());
+  vector<Status> status(tids.size());
+
+  for (int i = 0; i < tids.size(); i++) {
+    status[i] = collectors[i].TriggerAsync(tids[i], &traces[i]);
+  }
+
+  // Collect them all.
+  MonoTime deadline;
+  #ifdef THREAD_SANITIZER
+  // TSAN runs its own separate background thread which blocks all signals and
+  // thus will cause a timeout here.
+  deadline = MonoTime::Now() + MonoDelta::FromSeconds(3);
+  #else
+  // In normal builds we can expect to get a response from all threads.
+  deadline = MonoTime::Max();
+  #endif
+  for (int i = 0; i < tids.size(); i++) {
+    status[i] = status[i].AndThen([&] {
+        return collectors[i].AwaitCollection(deadline);
+      });
+    LOG(INFO) << "Thread " << tids[i] << ": " << status[i].ToString()
+              << ": " << traces[i].ToHexString();
   }
 }
 
@@ -297,5 +337,26 @@ TEST_P(RaceTest, TestStackTraceRaces) {
   }
 }
 
+void BlockSignalsThread() {
+  sigset_t set;
+  sigemptyset(&set);
+  sigaddset(&set, SIGUSR2);
+  for (int i = 0; i < 3; i++) {
+    CHECK_ERR(pthread_sigmask((i % 2) ? SIG_UNBLOCK : SIG_BLOCK, &set, nullptr));
+    SleepFor(MonoDelta::FromSeconds(1));
+  }
+}
+
+TEST_F(DebugUtilTest, TestThreadBlockingSignals) {
+  scoped_refptr<Thread> t;
+  ASSERT_OK(Thread::Create("test", "test thread", &BlockSignalsThread, &t));
+  SCOPED_CLEANUP({ t->Join(); });
+  string ret;
+  while (ret.find("unable to deliver signal") == string::npos) {
+    ret = DumpThreadStack(t->tid());
+    LOG(INFO) << ret;
+  }
+}
+
 #endif
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/c20b84bb/src/kudu/util/debug-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index 17cb940..0e1130b 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -18,7 +18,9 @@
 #include "kudu/util/debug-util.h"
 
 #include <dirent.h>
+#ifndef __linux__
 #include <sched.h>
+#endif
 #ifdef __linux__
 #include <syscall.h>
 #else
@@ -31,12 +33,13 @@
 #include <climits>
 #include <csignal>
 #include <ctime>
+#include <memory>
 #include <ostream>
 #include <string>
 
 #include <glog/logging.h>
 
-#include "kudu/gutil/atomicops.h"
+#include "kudu/gutil/dynamic_annotations.h"
 #include "kudu/gutil/hash/city.h"
 #include "kudu/gutil/linux_syscall_support.h"
 #include "kudu/gutil/macros.h"
@@ -44,6 +47,7 @@
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/gutil/strings/numbers.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/util/debug/leak_annotations.h"
 #include "kudu/util/debug/sanitizer_scopes.h"
 #include "kudu/util/debug/unwind_safeness.h"
 #include "kudu/util/errno.h"
@@ -51,6 +55,7 @@
 #include "kudu/util/thread.h"
 
 using std::string;
+using std::unique_ptr;
 using std::vector;
 
 #if defined(__APPLE__)
@@ -97,11 +102,9 @@ static const int kPrintfPointerFieldWidth = 2 + 2 * sizeof(void*);
 // This can't be in used by other libraries in the process.
 static int g_stack_trace_signum = SIGUSR2;
 
-// We only allow a single dumper thread to run at a time. This simplifies the synchronization
-// between the dumper and the target thread.
-//
-// This lock also protects changes to the signal handler.
-static base::SpinLock g_dumper_thread_lock(base::LINKER_INITIALIZED);
+// Protects g_stack_trace_signum and the installation of the signal
+// handler.
+static base::SpinLock g_signal_handler_lock(base::LINKER_INITIALIZED);
 
 namespace kudu {
 
@@ -123,8 +126,7 @@ void TryFlushCoverage() {
 }
 
 
-
-namespace {
+namespace stack_trace_internal {
 
 // Simple notification mechanism based on futex.
 //
@@ -136,6 +138,8 @@ namespace {
 // need strict timeouts here.
 class CompletionFlag {
  public:
+
+  // Mark the flag as complete, waking all waiters.
   void Signal() {
     complete_ = true;
 #ifndef __APPLE__
@@ -146,11 +150,12 @@ class CompletionFlag {
 #endif
   }
 
-  bool TimedWait(MonoDelta timeout) {
+  // Wait for the flag to be marked as complete, up until the given deadline.
+  // Returns true if the flag was marked complete before the deadline.
+  bool WaitUntil(MonoTime deadline) {
     if (complete_) return true;
 
     MonoTime now = MonoTime::Now();
-    MonoTime deadline = now + timeout;
     while (now < deadline) {
 #ifndef __APPLE__
       MonoDelta rem = deadline - now;
@@ -182,61 +187,53 @@ class CompletionFlag {
   std::atomic<int32_t> complete_ { 0 };
 };
 
-// Global structure used to communicate between the signal handler
-// and a dumping thread.
-struct SignalCommunication {
-  // The actual stack trace collected from the target thread.
-  StackTrace stack;
 
-  // The current target. Signals can be delivered asynchronously, so the
-  // dumper thread sets this variable first before sending a signal. If
-  // a signal is received on a thread that doesn't match 'target_tid', it is
-  // ignored.
-  pid_t target_tid;
+// A pointer to this structure is passed as signal data to a thread when
+// a stack trace is being remotely requested.
+struct SignalData {
+  // The actual destination for the stack trace collected from the target thread.
+  StackTrace* stack;
+
+  static const int kNotInUse = 0;
+  static const int kDumpInProgress = -1;
+  // Either one of the above constants, or if the dumper thread
+  // is waiting on a response, the tid that it is waiting on.
+  std::atomic<int64_t> queued_to_tid { kNotInUse };
 
   // Signaled when the target thread has successfully collected its stack.
   // The dumper thread waits for this to become true.
   CompletionFlag result_ready;
+};
 
-  // Lock protecting the other members. We use a bare atomic here and a custom
-  // lock guard below instead of existing spinlock implementaitons because futex()
-  // is not signal-safe.
-  Atomic32 lock;
+} // namespace stack_trace_internal
 
-  struct Lock;
-};
-SignalCommunication g_comm;
+using stack_trace_internal::SignalData;
 
-// Pared-down SpinLock for SignalCommunication::lock. This doesn't rely on futex
-// so it is async-signal safe.
-struct SignalCommunication::Lock {
-  Lock() {
-    while (base::subtle::Acquire_CompareAndSwap(&g_comm.lock, 0, 1) != 0) {
-      sched_yield();
-    }
-  }
-  ~Lock() {
-    base::subtle::Release_Store(&g_comm.lock, 0);
-  }
-};
+namespace {
 
 // Signal handler for our stack trace signal.
 // We expect that the signal is only sent from DumpThreadStack() -- not by a user.
-void HandleStackTraceSignal(int signum) {
-  SignalCommunication::Lock l;
-
-  // Check that the dumper thread is still interested in our stack trace.
-  // It's possible for signal delivery to be artificially delayed, in which
-  // case the dumper thread would have already timed out and moved on with
-  // its life. In that case, we don't want to race with some other thread's
-  // dump.
-  int64_t my_tid = Thread::CurrentThreadId();
-  if (g_comm.target_tid != my_tid) {
+void HandleStackTraceSignal(int /*signum*/, siginfo_t* info, void* /*ucontext*/) {
+  auto* sig_data = reinterpret_cast<SignalData*>(info->si_ptr);
+  DCHECK(sig_data);
+  if (!sig_data) {
+    // Maybe the signal was sent by a user instead of by ourself, ignore it.
     return;
   }
+  ANNOTATE_HAPPENS_AFTER(sig_data);
+  int64_t my_tid = Thread::CurrentThreadId();
 
-  g_comm.stack.Collect(2);
-  g_comm.result_ready.Signal();
+  // If we were slow to process the signal, the sender may have given up and
+  // no longer wants our stack trace. In that case, the 'sig' object will
+  // no longer contain our thread.
+  if (!sig_data->queued_to_tid.compare_exchange_strong(my_tid, SignalData::kDumpInProgress)) {
+    return;
+  }
+  // Marking it as kDumpInProgress ensures that the caller thread must now wait
+  // for our response, since we are writing directly into their StackTrace object.
+  sig_data->stack->Collect(/*skip_frames=*/2);
+  sig_data->queued_to_tid = SignalData::kNotInUse;
+  sig_data->result_ready.Signal();
 }
 
 bool InitSignalHandlerUnlocked(int signum) {
@@ -252,7 +249,7 @@ bool InitSignalHandlerUnlocked(int signum) {
   if (signum != g_stack_trace_signum && state == INITIALIZED) {
     struct sigaction old_act;
     PCHECK(sigaction(g_stack_trace_signum, nullptr, &old_act) == 0);
-    if (old_act.sa_handler == &HandleStackTraceSignal) {
+    if (old_act.sa_sigaction == &HandleStackTraceSignal) {
       signal(g_stack_trace_signum, SIG_DFL);
     }
   }
@@ -277,7 +274,13 @@ bool InitSignalHandlerUnlocked(int signum) {
     } else {
       // No one appears to be using the signal. This is racy, but there is no
       // atomic swap capability.
-      sighandler_t old_handler = signal(g_stack_trace_signum, HandleStackTraceSignal);
+      struct sigaction act;
+      memset(&act, 0, sizeof(act));
+      act.sa_sigaction = &HandleStackTraceSignal;
+      act.sa_flags = SA_SIGINFO | SA_RESTART;
+      struct sigaction old_act;
+      CHECK_ERR(sigaction(g_stack_trace_signum, &act, &old_act));
+      sighandler_t old_handler = old_act.sa_handler;
       if (old_handler != SIG_IGN &&
           old_handler != SIG_DFL) {
         LOG(FATAL) << "raced against another thread installing a signal handler";
@@ -288,69 +291,160 @@ bool InitSignalHandlerUnlocked(int signum) {
   return state == INITIALIZED;
 }
 
-
 } // anonymous namespace
 
 Status SetStackTraceSignal(int signum) {
-  base::SpinLockHolder h(&g_dumper_thread_lock);
+  base::SpinLockHolder h(&g_signal_handler_lock);
   if (!InitSignalHandlerUnlocked(signum)) {
     return Status::InvalidArgument("unable to install signal handler");
   }
   return Status::OK();
 }
 
-Status GetThreadStack(int64_t tid, StackTrace* stack) {
-#if defined(__linux__)
-  base::SpinLockHolder h(&g_dumper_thread_lock);
+StackTraceCollector::StackTraceCollector(StackTraceCollector&& other) noexcept
+    : tid_(other.tid_),
+      sig_data_(other.sig_data_) {
+  other.tid_ = 0;
+  other.sig_data_ = nullptr;
+}
 
-  // Ensure that our signal handler is installed. We don't need any fancy GoogleOnce here
-  // because of the mutex above.
-  if (!InitSignalHandlerUnlocked(g_stack_trace_signum)) {
-    return Status::NotSupported("unable to take thread stack: signal handler unavailable");
+StackTraceCollector::~StackTraceCollector() {
+  if (sig_data_) {
+    RevokeSigData();
   }
+}
 
-  // Set the target TID in our communication structure, so if we end up with any
-  // delayed signal reaching some other thread, it will know to ignore it.
+#ifdef __linux__
+void StackTraceCollector::RevokeSigData() {
+  // We have several cases to consider. Though it involves some simple code
+  // duplication, each case is handled separately for clarity.
+
+  // 1) We have sent the signal, and the thread has completed the stack trace.
+  //    This this is the "happy path".
+  if (sig_data_->result_ready.complete()) {
+    delete sig_data_;
+    sig_data_ = nullptr;
+    return;
+  }
+
+  // 2) Timed out, but signal still pending and signal handler not yet invoked.
+  //    In this case, we need to make sure that when it does later get the signal,
+  //    it doesn't attempt to read 'sig_data_' after we've freed it. However
+  //    we can still safely free the stack trace object itself.
+  //
+  // 3) Timed out, but signal has been delivered and the stack tracing is in
+  //    progress. In this case we have to wait for it to finish. This case should
+  //    be very rare, since the trace collection itself is typically fast.
+  //
+  // We can distinguish between case (2) and (3) using the 'queued_to_tid' member:
+  int64_t old_val = sig_data_->queued_to_tid.exchange(SignalData::kNotInUse);
+
+  // In case (2), the signal handler hasn't started collecting a stack trace, so we
+  // were able to exchange it out and see that it was still "queued". In this case,
+  // if the signal later gets delivered, we can't free the sig_data_ struct itself.
+  // We intentionally leak it. Note, however, that when it does run, it will
+  // see that we exchanged out its tid from 'queued_to_tid' and therefore won't
+  // attempt to write into the stack_ structure.
+  //
+  // TODO(todd) instead of leaking, we can insert these lost structs into a
+  // global free-list, and then reuse them the next time we want to send a
+  // signal. The re-use is safe since access is limited to a specific tid.
+  if (old_val == tid_) {
+    DLOG(WARNING) << "Leaking SignalData structure " << sig_data_ << " after lost signal "
+                  << "to thread " << tid_;
+    ANNOTATE_LEAKING_OBJECT_PTR(sig_data_);
+    sig_data_ = nullptr;
+    return;
+  }
+
+  // In case (3), the signal handler started running but isn't complete yet.
+  // We have no choice but to await completion
+  CHECK(old_val == SignalData::kDumpInProgress);
+  CHECK(sig_data_->result_ready.WaitUntil(MonoTime::Max()));
+
+  delete sig_data_;
+  sig_data_ = nullptr;
+}
+
+
+Status StackTraceCollector::TriggerAsync(int64_t tid, StackTrace* stack) {
+  CHECK(!sig_data_ && tid_ == 0) << "TriggerAsync() must not be called more than once per instance";
+
+  // Ensure that our signal handler is installed.
   {
-    SignalCommunication::Lock l;
-    CHECK_EQ(0, g_comm.target_tid);
-    g_comm.target_tid = tid;
+    base::SpinLockHolder h(&g_signal_handler_lock);
+    if (!InitSignalHandlerUnlocked(g_stack_trace_signum)) {
+      return Status::NotSupported("unable to take thread stack: signal handler unavailable");
+    }
   }
 
+  std::unique_ptr<SignalData> data(new SignalData());
+  // Set the target TID in our communication structure, so if we end up with any
+  // delayed signal reaching some other thread, it will know to ignore it.
+  data->queued_to_tid = tid;
+  data->stack = CHECK_NOTNULL(stack);
+
   // We use the raw syscall here instead of kill() to ensure that we don't accidentally
   // send a signal to some other process in the case that the thread has exited and
   // the TID been recycled.
-  if (syscall(SYS_tgkill, getpid(), tid, g_stack_trace_signum) != 0) {
-    {
-      SignalCommunication::Lock l;
-      g_comm.target_tid = 0;
-    }
+  siginfo_t info;
+  memset(&info, 0, sizeof(info));
+  info.si_signo = g_stack_trace_signum;
+  info.si_code = SI_QUEUE;
+  info.si_pid = getpid();
+  info.si_uid = getuid();
+  info.si_value.sival_ptr = data.get();
+  // Since we're using a signal to pass information between the two threads,
+  // we need to help TSAN out and explicitly tell it about the happens-before
+  // relationship here.
+  ANNOTATE_HAPPENS_BEFORE(data.get());
+  if (syscall(SYS_rt_tgsigqueueinfo, getpid(), tid, g_stack_trace_signum, &info) != 0) {
     return Status::NotFound("unable to deliver signal: process may have exited");
   }
 
+  // The signal is now pending to the target thread. We don't store it in a unique_ptr
+  // inside the class since we need to be careful to destruct it safely in case the
+  // target thread hasn't yet received the signal when this instance gets destroyed.
+  sig_data_ = data.release();
+  tid_ = tid;
+
+  return Status::OK();
+}
+
+Status StackTraceCollector::AwaitCollection(MonoTime deadline) {
+  CHECK(sig_data_) << "Must successfully call TriggerAsync() first";
+
   // We give the thread ~1s to respond. In testing, threads typically respond within
   // a few milliseconds, so this timeout is very conservative.
   //
   // The main reason that a thread would not respond is that it has blocked signals. For
   // example, glibc's timer_thread doesn't respond to our signal, so we always time out
   // on that one.
-  string ret;
-
-  g_comm.result_ready.TimedWait(MonoDelta::FromSeconds(1));
-  {
-    SignalCommunication::Lock l;
-    CHECK_EQ(tid, g_comm.target_tid);
-    g_comm.target_tid = 0;
-    if (!g_comm.result_ready.complete()) {
-      return Status::TimedOut("(thread did not respond: maybe it is blocking signals)");
-    }
-    stack->CopyFrom(g_comm.stack);
-    g_comm.result_ready.Reset();
+  bool got_result = sig_data_->result_ready.WaitUntil(deadline);
+  RevokeSigData();
+  if (!got_result) {
+    return Status::TimedOut("thread did not respond: maybe it is blocking signals");
   }
+
   return Status::OK();
-#else // defined(__linux__)
+}
+
+#else  // __linux__
+Status StackTraceCollector::TriggerAsync(int64_t tid_, StackTrace* stack) {
+  return Status::NotSupported("unsupported platform");
+}
+Status StackTraceCollector::AwaitCollection() {
   return Status::NotSupported("unsupported platform");
-#endif
+}
+void StackTraceCollector::RevokeSigData() {
+}
+#endif // __linux__
+
+Status GetThreadStack(int64_t tid, StackTrace* stack) {
+  StackTraceCollector c;
+  RETURN_NOT_OK(c.TriggerAsync(tid, stack));
+  RETURN_NOT_OK(c.AwaitCollection(MonoTime::Now() + MonoDelta::FromSeconds(1)));
+  return Status::OK();
 }
 
 string DumpThreadStack(int64_t tid) {
@@ -362,8 +456,6 @@ string DumpThreadStack(int64_t tid) {
   return strings::Substitute("<$0>", s.ToString());
 }
 
-
-
 Status ListThreads(vector<pid_t> *tids) {
 #ifndef __linux__
   return Status::NotSupported("unable to list threads on this platform");

http://git-wip-us.apache.org/repos/asf/kudu/blob/c20b84bb/src/kudu/util/debug-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h
index c96f2d4..efe5857 100644
--- a/src/kudu/util/debug-util.h
+++ b/src/kudu/util/debug-util.h
@@ -24,13 +24,19 @@
 #include <string>
 #include <vector>
 
+#include "kudu/gutil/macros.h"
 #include "kudu/gutil/strings/fastmem.h"
 #include "kudu/util/status.h"
 
 namespace kudu {
 
+class MonoTime;
 class StackTrace;
 
+namespace stack_trace_internal {
+struct SignalData;
+}
+
 // Return true if coverage is enabled.
 bool IsCoverageBuild();
 
@@ -176,6 +182,45 @@ class StackTrace {
   void* frames_[kMaxFrames];
 };
 
+
+// Class to collect the stack trace of another thread within this process.
+// This allows for more advanced use cases than 'DumpThreadStack(tid)' above.
+// Namely, this provides an asynchronous trigger/collect API so that many
+// stack traces can be collected from many different threads in parallel using
+// different instances of this object.
+class StackTraceCollector {
+ public:
+  StackTraceCollector() = default;
+  StackTraceCollector(StackTraceCollector&& other) noexcept;
+  ~StackTraceCollector();
+
+  // Send the asynchronous request to the the thread with TID 'tid'
+  // to collect its stack trace into '*stack'.
+  //
+  // NOTE: 'stack' must remain a valid pointer until AwaitCollection() has
+  // completed.
+  //
+  // Returns OK if the signal was sent successfully.
+  Status TriggerAsync(int64_t tid, StackTrace* stack);
+
+  // Wait for the stack trace to be collected from the target thread.
+  //
+  // REQUIRES: TriggerAsync() has returned successfully.
+  Status AwaitCollection(MonoTime deadline);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(StackTraceCollector);
+
+  // Safely sets 'sig_data_' back to nullptr after having sent an asynchronous
+  // stack trace request. See implementation for details.
+  //
+  // POSTCONDITION: sig_data_ == nullptr
+  void RevokeSigData();
+
+  int64_t tid_ = 0;
+  stack_trace_internal::SignalData* sig_data_ = nullptr;
+};
+
 } // namespace kudu
 
 #endif


[2/2] kudu git commit: KUDU-2291 (part 6) stacks: use libunwind directly for stack tracing

Posted by to...@apache.org.
KUDU-2291 (part 6) stacks: use libunwind directly for stack tracing

Previously we used glog's wrapper around libunwind for stack tracing.
However that has a deficiency that it assumes that, process wide, only
one thread can be inside libunwind at a time[1]

It appears that this is left over from some very old versions of
libunwind, or was already unnecessarily conservative. libunwind is meant
to be thread safe, and we have tests that will trigger if it is not.

This just extracts the function body of the glog function we were using
and does the same work manually.

Without this fix, the "collect from all the threads at the same time"
code path resulted in most of the threads collecting an empty trace
since they tried to call libunwind at the same time.

[1] https://github.com/google/glog/issues/298
Change-Id: I3a53e55d7c4e7ee50bcac5b1e81267df56383634
Reviewed-on: http://gerrit.cloudera.org:8080/9319
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/0ee40b66
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/0ee40b66
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/0ee40b66

Branch: refs/heads/master
Commit: 0ee40b6618df235fa7d31eeba3caede93b2a3852
Parents: c20b84b
Author: Todd Lipcon <to...@apache.org>
Authored: Tue Feb 13 18:37:29 2018 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Feb 21 23:54:55 2018 +0000

----------------------------------------------------------------------
 build-support/iwyu/iwyu.sh                | 16 +++-----
 build-support/iwyu/mappings/libunwind.imp | 23 ++++++++++++
 src/kudu/util/debug-util-test.cc          |  2 +-
 src/kudu/util/debug-util.cc               | 51 +++++++++++++++++++++++++-
 src/kudu/util/debug-util.h                |  6 +--
 5 files changed, 81 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/0ee40b66/build-support/iwyu/iwyu.sh
----------------------------------------------------------------------
diff --git a/build-support/iwyu/iwyu.sh b/build-support/iwyu/iwyu.sh
index 6ee6287..417cdf3 100755
--- a/build-support/iwyu/iwyu.sh
+++ b/build-support/iwyu/iwyu.sh
@@ -44,18 +44,12 @@ for p in $file_list_tmp; do
   IWYU_FILE_LIST="$IWYU_FILE_LIST $ROOT/$p"
 done
 
+IWYU_ARGS="--max_line_length=256"
+
 IWYU_MAPPINGS_PATH="$ROOT/build-support/iwyu/mappings"
-IWYU_ARGS="\
-    --max_line_length=256 \
-    --mapping_file=$IWYU_MAPPINGS_PATH/boost-all.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/boost-all-private.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/boost-extra.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/gflags.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/glog.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/gtest.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/kudu.imp \
-    --mapping_file=$IWYU_MAPPINGS_PATH/libstdcpp.imp\
-    --mapping_file=$IWYU_MAPPINGS_PATH/system-linux.imp"
+for path in $IWYU_MAPPINGS_PATH/*.imp ; do
+    IWYU_ARGS="$IWYU_ARGS --mapping_file=$path"
+done
 
 if ! PATH="$PATH:$PWD/../../thirdparty/clang-toolchain/bin" \
     python $ROOT/build-support/iwyu/iwyu_tool.py -p . $IWYU_FILE_LIST -- \

http://git-wip-us.apache.org/repos/asf/kudu/blob/0ee40b66/build-support/iwyu/mappings/libunwind.imp
----------------------------------------------------------------------
diff --git a/build-support/iwyu/mappings/libunwind.imp b/build-support/iwyu/mappings/libunwind.imp
new file mode 100644
index 0000000..590e6c2
--- /dev/null
+++ b/build-support/iwyu/mappings/libunwind.imp
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#
+# This file contains mappings for libunwind
+[
+  { include: ["<libunwind-x86_64.h>", "private", "<libunwind.h>", "public"] },
+  { include: ["\"libunwind-x86_64.h\"", "private", "<libunwind.h>", "public"] }
+]

http://git-wip-us.apache.org/repos/asf/kudu/blob/0ee40b66/src/kudu/util/debug-util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc
index abd7c4f..c51bf0c 100644
--- a/src/kudu/util/debug-util-test.cc
+++ b/src/kudu/util/debug-util-test.cc
@@ -51,7 +51,7 @@ class DebugUtilTest : public KuduTest {
 
 TEST_F(DebugUtilTest, TestStackTrace) {
   StackTrace t;
-  t.Collect(1);
+  t.Collect(0);
   string trace = t.Symbolize();
   ASSERT_STR_CONTAINS(trace, "kudu::DebugUtilTest_TestStackTrace_Test::TestBody");
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/0ee40b66/src/kudu/util/debug-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index 0e1130b..8fd1575 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -38,6 +38,11 @@
 #include <string>
 
 #include <glog/logging.h>
+#include <glog/raw_logging.h>
+#ifdef __linux__
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+#endif
 
 #include "kudu/gutil/dynamic_annotations.h"
 #include "kudu/gutil/hash/city.h"
@@ -48,10 +53,13 @@
 #include "kudu/gutil/strings/numbers.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/debug/leak_annotations.h"
+#ifndef __linux__
 #include "kudu/util/debug/sanitizer_scopes.h"
+#endif
 #include "kudu/util/debug/unwind_safeness.h"
 #include "kudu/util/errno.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/thread.h"
 
 using std::string;
@@ -214,6 +222,12 @@ namespace {
 // Signal handler for our stack trace signal.
 // We expect that the signal is only sent from DumpThreadStack() -- not by a user.
 void HandleStackTraceSignal(int /*signum*/, siginfo_t* info, void* /*ucontext*/) {
+  // Signal handlers may be invoked at any point, so it's important to preserve
+  // errno.
+  int save_errno = errno;
+  SCOPED_CLEANUP({
+      errno = save_errno;
+    });
   auto* sig_data = reinterpret_cast<SignalData*>(info->si_ptr);
   DCHECK(sig_data);
   if (!sig_data) {
@@ -231,7 +245,7 @@ void HandleStackTraceSignal(int /*signum*/, siginfo_t* info, void* /*ucontext*/)
   }
   // Marking it as kDumpInProgress ensures that the caller thread must now wait
   // for our response, since we are writing directly into their StackTrace object.
-  sig_data->stack->Collect(/*skip_frames=*/2);
+  sig_data->stack->Collect(/*skip_frames=*/1);
   sig_data->queued_to_tid = SignalData::kNotInUse;
   sig_data->result_ready.Signal();
 }
@@ -520,10 +534,43 @@ void StackTrace::Collect(int skip_frames) {
     num_frames_ = 1;
     return;
   }
+  const int kMaxDepth = arraysize(frames_);
+
+#ifdef __linux__
+  unw_cursor_t cursor;
+  unw_context_t uc;
+  unw_getcontext(&uc);
+  RAW_CHECK(unw_init_local(&cursor, &uc) >= 0, "unw_init_local failed");
+  skip_frames++;         // Do not include the "Collect" frame
+
+  num_frames_ = 0;
+  while (num_frames_ < kMaxDepth) {
+    void *ip;
+    int ret = unw_get_reg(&cursor, UNW_REG_IP, reinterpret_cast<unw_word_t *>(&ip));
+    if (ret < 0) {
+      break;
+    }
+    if (skip_frames > 0) {
+      skip_frames--;
+    } else {
+      frames_[num_frames_++] = ip;
+    }
+    ret = unw_step(&cursor);
+    if (ret <= 0) {
+      break;
+    }
+  }
+#else
+  // On OSX, use the unwinder from glog. However, that unwinder has an issue where
+  // concurrent invocations will return no frames. See:
+  // https://github.com/google/glog/issues/298
+  // The worst result here is an empty result.
+
   // google::GetStackTrace has a data race. This is called frequently, so better
   // to ignore it with an annotation rather than use a suppression.
   debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
-  num_frames_ = google::GetStackTrace(frames_, arraysize(frames_), skip_frames);
+  num_frames_ = google::GetStackTrace(frames_, kMaxDepth, skip_frames + 1);
+#endif
 }
 
 void StackTrace::StringifyToHex(char* buf, size_t size, int flags) const {

http://git-wip-us.apache.org/repos/asf/kudu/blob/0ee40b66/src/kudu/util/debug-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h
index efe5857..637e49f 100644
--- a/src/kudu/util/debug-util.h
+++ b/src/kudu/util/debug-util.h
@@ -132,11 +132,11 @@ class StackTrace {
   }
 
   // Collect and store the current stack trace. Skips the top 'skip_frames' frames
-  // from the stack. For example, a value of '1' will skip the 'Collect()' function
-  // call itself.
+  // from the stack. For example, a value of '1' will skip whichever function
+  // called the 'Collect()' function. The 'Collect' function itself is always skipped.
   //
   // This function is async-safe.
-  void Collect(int skip_frames = 1);
+  void Collect(int skip_frames = 0);
 
 
   enum Flags {