You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/08 23:36:58 UTC

[impala] 01/27: IMPALA-11275: log thread info during minidump

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 964c55f6878d02fa8239229969796da5a3cfbe4b
Author: Michael Smith <mi...@cloudera.com>
AuthorDate: Mon May 9 16:24:28 2022 -0700

    IMPALA-11275: log thread info during minidump
    
    Writes ThreadDebugInfo to stdout/stderr when a minidump is generated to
    capture thread and query details related to the dump. Example message:
    > Minidump in thread [1790536]async-exec-thread running query
      1a47cc1e2df94cb4:88dfa08200000000, fragment instance
      0000000000000000:0000000000000000
    
    Refactors DumpCallback so that repeated writes to STDOUT/STDERR are less
    redundant.
    
    Adds unit tests to run with ThreadDebugInfo. Removes the 'static' prefix
    from DumpCallback so it can be invoked from unit tests, but doesn't add
    it to the header as it's intended to be for internal use.
    
    Testing:
    - Added crash to Coordinator::Exec and manually tested dump handling.
    - Added a new unit test for DumpCallback.
    - Ran tests/custom_cluster/test_breakpad.py to verify nothing broke in
      refactor. Those tests don't have ThreadDebugInfo available.
    
    Change-Id: Iea2bdf10db29a0f8ccbe5e767b708781d42a9b8a
    Reviewed-on: http://gerrit.cloudera.org:8080/18508
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/CMakeLists.txt            |  2 +
 be/src/util/debug-util-test.cc        | 15 +++++++
 be/src/util/debug-util.cc             | 14 ++++++
 be/src/util/debug-util.h              |  9 +++-
 be/src/util/minidump-test.cc          | 83 +++++++++++++++++++++++++++++++++++
 be/src/util/minidump.cc               | 80 ++++++++++++++++++++++++++-------
 tests/custom_cluster/test_breakpad.py |  4 ++
 7 files changed, 190 insertions(+), 17 deletions(-)

diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 69622a5b6..8aa4386de 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -164,6 +164,7 @@ add_library(UtilTests STATIC
   lru-multi-cache-test.cc
   metrics-test.cc
   min-max-filter-test.cc
+  minidump-test.cc
   openssl-util-test.cc
   os-info-test.cc
   os-util-test.cc
@@ -237,6 +238,7 @@ ADD_UNIFIED_BE_LSAN_TEST(lru-multi-cache-test "LruMultiCache.*")
 ADD_UNIFIED_BE_LSAN_TEST(logging-support-test "LoggingSupport.*")
 ADD_UNIFIED_BE_LSAN_TEST(metrics-test "MetricsTest.*")
 ADD_UNIFIED_BE_LSAN_TEST(min-max-filter-test "MinMaxFilterTest.*")
+ADD_UNIFIED_BE_LSAN_TEST(minidump-test "Minidump.*")
 ADD_UNIFIED_BE_LSAN_TEST(openssl-util-test "OpenSSLUtilTest.*")
 ADD_UNIFIED_BE_LSAN_TEST(os-info-test "OsInfo.*")
 ADD_UNIFIED_BE_LSAN_TEST(os-util-test "OsUtil.*")
diff --git a/be/src/util/debug-util-test.cc b/be/src/util/debug-util-test.cc
index 0b92c2785..417330d51 100644
--- a/be/src/util/debug-util-test.cc
+++ b/be/src/util/debug-util-test.cc
@@ -37,6 +37,21 @@ TEST(DebugUtil, UniqueID) {
   EXPECT_EQ("feedbeeff00d7777:0000000000000020", PrintId(unique_id));
 }
 
+TEST(DebugUtil, UniqueIDCompromised) {
+  TUniqueId unique_id;
+  unique_id.hi = 0xfeedbeeff00d7777ULL;
+  unique_id.lo = 0x2020202020202020ULL;
+  char out[TUniqueIdBufferSize+1];
+  out[TUniqueIdBufferSize] = '\0';
+
+  PrintIdCompromised(unique_id, out);
+  EXPECT_EQ(string("feedbeeff00d7777:2020202020202020"), out);
+
+  unique_id.lo = 0x20ULL;
+  PrintIdCompromised(unique_id, out);
+  EXPECT_EQ(string("feedbeeff00d7777:0000000000000020"), out);
+}
+
 string RecursionStack(int level) {
   if (level == 0) return GetStackTrace();
   return RecursionStack(level - 1);
diff --git a/be/src/util/debug-util.cc b/be/src/util/debug-util.cc
index 734476486..59d61c8b0 100644
--- a/be/src/util/debug-util.cc
+++ b/be/src/util/debug-util.cc
@@ -120,6 +120,20 @@ string PrintId(const UniqueIdPB& id, const string& separator) {
   return out.str();
 }
 
+static void my_i64tohex(int64_t w, char out[16]) {
+  static const char* digits = "0123456789abcdef";
+  for (size_t i = 0, j=60; i < 16; ++i, j -= 4) {
+    out[i] = digits[(w>>j) & 0x0f];
+  }
+}
+
+void PrintIdCompromised(const TUniqueId& id, char out[TUniqueIdBufferSize],
+    const char separator) {
+  my_i64tohex(id.hi, out);
+  out[16] = separator;
+  my_i64tohex(id.lo, out+17);
+}
+
 bool ParseId(const string& s, TUniqueId* id) {
   // For backwards compatibility, this method parses two forms of query ID from text:
   //  - <hex-int64_t><colon><hex-int64_t> - this format is the standard going forward
diff --git a/be/src/util/debug-util.h b/be/src/util/debug-util.h
index 5ae707444..e25ee92ad 100644
--- a/be/src/util/debug-util.h
+++ b/be/src/util/debug-util.h
@@ -87,11 +87,18 @@ std::string PrintThriftEnum(const TParquetBloomFilterWrite::type& value);
 std::string PrintTuple(const Tuple* t, const TupleDescriptor& d);
 std::string PrintRow(TupleRow* row, const RowDescriptor& d);
 std::string PrintBatch(RowBatch* batch);
-/// Converts id to a string represantation. If necessary, the gdb equivalent is:
+/// Converts id to a string representation. If necessary, the gdb equivalent is:
 ///    printf "%lx:%lx\n", id.hi, id.lo
 std::string PrintId(const TUniqueId& id, const std::string& separator = ":");
 std::string PrintId(const UniqueIdPB& id, const std::string& separator = ":");
 
+/// Converts id to a string representation without using any shared library calls.
+/// Follows Breakpad's guidance for compromised contexts, see
+/// https://github.com/google/breakpad/blob/main/docs/linux_starter_guide.md
+constexpr int TUniqueIdBufferSize = 33;
+void PrintIdCompromised(const TUniqueId& id, char out[TUniqueIdBufferSize],
+    const char separator = ':');
+
 inline ostream& operator<<(ostream& os, const UniqueIdPB& id) {
   return os << PrintId(id);
 }
diff --git a/be/src/util/minidump-test.cc b/be/src/util/minidump-test.cc
new file mode 100644
index 000000000..afcf4a883
--- /dev/null
+++ b/be/src/util/minidump-test.cc
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/minidump.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/regex.hpp>
+#include <gtest/gtest.h>
+
+#include "client/linux/handler/minidump_descriptor.h"
+#include "common/thread-debug-info.h"
+
+namespace impala {
+
+bool DumpCallback(const google_breakpad::MinidumpDescriptor& descriptor, void* context,
+    bool succeeded);
+
+TEST(Minidump, DumpCallback) {
+  testing::internal::CaptureStdout();
+  testing::internal::CaptureStderr();
+
+  google_breakpad::MinidumpDescriptor descriptor("/tmp/arbitrary/path");
+  descriptor.UpdatePath();
+  DumpCallback(descriptor, nullptr, true);
+
+  std::string stdout = testing::internal::GetCapturedStdout();
+  std::string stderr = testing::internal::GetCapturedStderr();
+  boost::regex wrote_minidump("Wrote minidump to /tmp/arbitrary/path/.*\\.dmp");
+
+  for (std::string output : {stdout, stderr}) {
+    std::vector<std::string> lines;
+    boost::split(lines, output, boost::is_any_of("\n\r"), boost::token_compress_on);
+    EXPECT_EQ(3, lines.size());
+    EXPECT_EQ("Minidump with no thread info available.", lines[0]);
+    EXPECT_TRUE(boost::regex_match(lines[1], wrote_minidump));
+    EXPECT_EQ("", lines[2]);
+  }
+}
+
+TEST(Minidump, DumpCallbackWithThread) {
+  testing::internal::CaptureStdout();
+  testing::internal::CaptureStderr();
+
+  ThreadDebugInfo tdi;
+  TUniqueId query, instance;
+  std::tie(query.hi, query.lo, instance.hi, instance.lo) = std::make_tuple(1, 2, 3, 4);
+  tdi.SetQueryId(query);
+  tdi.SetInstanceId(instance);
+  google_breakpad::MinidumpDescriptor descriptor("/tmp/arbitrary/path");
+  descriptor.UpdatePath();
+  DumpCallback(descriptor, nullptr, true);
+
+  std::string stdout = testing::internal::GetCapturedStdout();
+  std::string stderr = testing::internal::GetCapturedStderr();
+  boost::regex minidump_in_thread("Minidump in thread \\[.*\\] running query "
+      "0{15}1:0{15}2, fragment instance 0{15}3:0{15}4");
+  boost::regex wrote_minidump("Wrote minidump to /tmp/arbitrary/path/.*\\.dmp");
+
+  for (std::string output : {stdout, stderr}) {
+    std::vector<std::string> lines;
+    boost::split(lines, output, boost::is_any_of("\n\r"), boost::token_compress_on);
+    EXPECT_EQ(3, lines.size());
+    EXPECT_TRUE(boost::regex_match(lines[0], minidump_in_thread));
+    EXPECT_TRUE(boost::regex_match(lines[1], wrote_minidump));
+    EXPECT_EQ("", lines[2]);
+  }
+}
+
+} // namespace impala
diff --git a/be/src/util/minidump.cc b/be/src/util/minidump.cc
index f09439f70..25a329f80 100644
--- a/be/src/util/minidump.cc
+++ b/be/src/util/minidump.cc
@@ -32,7 +32,9 @@
 #include <sstream>
 
 #include "common/logging.h"
+#include "common/thread-debug-info.h"
 #include "common/version.h"
+#include "util/debug-util.h"
 #include "util/filesystem-util.h"
 #include "util/time.h"
 
@@ -64,31 +66,77 @@ static bool FilterCallback(void* context) {
   return minidumps_enabled;
 }
 
+static void write_dump_threadinfo(int fd, ThreadDebugInfo* thread_info) {
+  constexpr char thread_msg[] = "Minidump in thread ";
+  // pid_t is signed, but valid process IDs must always be positive.
+  const int64_t thread_id = thread_info->GetSystemThreadId();
+  // 20 characters needed for UINT64_MAX.
+  char thread_id_str[20];
+  const unsigned int thread_id_len = my_uint_len(thread_id);
+  my_uitos(thread_id_str, thread_id, thread_id_len);
+  const char* thread_name = thread_info->GetThreadName();
+
+  constexpr char query_msg[] = " running query ";
+  const TUniqueId& query_id = thread_info->GetQueryId();
+  char query_id_str[TUniqueIdBufferSize];
+  PrintIdCompromised(query_id, query_id_str);
+
+  constexpr char instance_msg[] = ", fragment instance ";
+  const TUniqueId& instance_id = thread_info->GetInstanceId();
+  // Format TUniqueId according to PrintId from util/debug-util.h
+  char instance_id_str[TUniqueIdBufferSize];
+  PrintIdCompromised(instance_id, instance_id_str);
+
+  // Example:
+  // > Minidump in thread [1790536]async-exec-thread running query 1a47cc1e2df94cb4:
+  //   88dfa08200000000, fragment instance 0000000000000000:0000000000000000
+  sys_write(fd, thread_msg, sizeof(thread_msg) / sizeof(thread_msg[0]) - 1);
+  sys_write(fd, "[", 1);
+  sys_write(fd, thread_id_str, thread_id_len);
+  sys_write(fd, "]", 1);
+  sys_write(fd, thread_name, my_strlen(thread_name));
+  sys_write(fd, query_msg, sizeof(query_msg) / sizeof(query_msg[0]) - 1);
+  sys_write(fd, query_id_str, TUniqueIdBufferSize);
+  sys_write(fd, instance_msg, sizeof(instance_msg) / sizeof(instance_msg[0]) - 1);
+  sys_write(fd, instance_id_str, TUniqueIdBufferSize);
+  sys_write(fd, "\n", 1);
+}
+
+static void write_dump_path(int fd, const char* path) {
+  // We use the linux syscall support methods from chromium here as per the
+  // recommendation of the breakpad docs to avoid calling into other shared libraries.
+  const char msg[] = "Wrote minidump to ";
+  sys_write(fd, msg, sizeof(msg) / sizeof(msg[0]) - 1);
+    // We use breakpad's reimplementation of strlen to avoid calling into libc.
+  sys_write(fd, path, my_strlen(path));
+  sys_write(fd, "\n", 1);
+}
+
 /// Callback for breakpad. It is called by breakpad whenever a minidump file has been
 /// written and should not be called directly. It logs the event before breakpad crashes
 /// the process. Due to the process being in a failed state we write to stdout/stderr and
 /// let the surrounding redirection make sure the output gets logged. The calls might
 /// still fail in unknown scenarios as the process is in a broken state. However we don't
 /// rely on them as the minidump file has been written already.
-static bool DumpCallback(const google_breakpad::MinidumpDescriptor& descriptor,
-    void* context, bool succeeded) {
+bool DumpCallback(const google_breakpad::MinidumpDescriptor& descriptor, void* context,
+    bool succeeded) {
   // See if a file was written successfully.
   if (succeeded) {
-    // Write message to stdout/stderr, which will usually be captured in the INFO/ERROR
-    // log.
-    const char msg[] = "Wrote minidump to ";
-    const int msg_len = sizeof(msg) / sizeof(msg[0]) - 1;
+    // Write to stdout/stderr, which will usually be captured in the INFO/ERROR log.
+    ThreadDebugInfo* thread_info = GetThreadDebugInfo();
+    if (thread_info != nullptr) {
+      write_dump_threadinfo(STDOUT_FILENO, thread_info);
+      write_dump_threadinfo(STDERR_FILENO, thread_info);
+    } else {
+      const char msg[] = "Minidump with no thread info available.\n";
+      const int msg_len = sizeof(msg) / sizeof(msg[0]) - 1;
+      sys_write(STDOUT_FILENO, msg, msg_len);
+      sys_write(STDERR_FILENO, msg, msg_len);
+    }
+
     const char* path = descriptor.path();
-    // We use breakpad's reimplementation of strlen to avoid calling into libc.
-    const int path_len = my_strlen(path);
-    // We use the linux syscall support methods from chromium here as per the
-    // recommendation of the breakpad docs to avoid calling into other shared libraries.
-    sys_write(STDOUT_FILENO, msg, msg_len);
-    sys_write(STDOUT_FILENO, path, path_len);
-    sys_write(STDOUT_FILENO, "\n", 1);
-    sys_write(STDERR_FILENO, msg, msg_len);
-    sys_write(STDERR_FILENO, path, path_len);
-    sys_write(STDERR_FILENO, "\n", 1);
+    write_dump_path(STDOUT_FILENO, path);
+    write_dump_path(STDERR_FILENO, path);
   }
   // Return the value received in the call as described in the minidump documentation. If
   // this values is true, then no other handlers will be called. Breakpad will still crash
diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py
index 033e4790d..d106093ef 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -143,6 +143,10 @@ class TestBreakpadBase(CustomClusterTestSuite):
         expected_count=expected_count)
     self.assert_impalad_log_contains('ERROR', 'Wrote minidump to ',
         expected_count=expected_count)
+    self.assert_impalad_log_contains('INFO', 'Minidump with no thread info available.',
+        expected_count=expected_count)
+    self.assert_impalad_log_contains('ERROR', 'Minidump with no thread info available.',
+        expected_count=expected_count)
 
 
 class TestBreakpadCore(TestBreakpadBase):