You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/10/13 16:23:29 UTC
[impala] 01/03: IMPALA-11226: Add script to simplify resolving minidumps

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c3d7f20a890ad6c142aaee574da51070dee5d44e
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Mon Aug 22 19:57:13 2022 -0700

    IMPALA-11226: Add script to simplify resolving minidumps
    
    This adds the resolve_minidumps.py script to
    simplify resolving minidumps under ideal circumstances.
    This is designed to handle cases where the binary
    and libraries are in identical locations to when
    the minidump was created. This is true for developer
    environments and at the end of Jenkins jobs.
    
    This uses Breakpad's minidump_dump utility to get a
    list of the binaries/libraries that the minidump
    references. It uses that list to dump all the
    symbols to a temporary directory. Then it uses
    the symbols to resolve the minidump.
    
    Since it is dumping symbols for all referenced
    libraries, it resolves symbols to the maximum
    extent possible.
    
    This adds a step to bin/jenkins/finalize.sh to use
    this new script to resolve minidumps. The old method
    can be removed in a subsequent change.
    
    Testing:
     - Ran locally on a minidump generated by sending
       SIGUSR1 to local impalad
     - Tested with a Centos 7 job using Python 3.6
       and verified the minidump output
     - Tested resolving a minidump from a binary with
       compressed debug info
    
    Change-Id: I0f8fdcb8ca89d0904dc8ec69337e3d5dfdd54adf
    Reviewed-on: http://gerrit.cloudera.org:8080/18918
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/jenkins/finalize.sh  |  19 +++
 bin/resolve_minidumps.py | 376 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 395 insertions(+)

diff --git a/bin/jenkins/finalize.sh b/bin/jenkins/finalize.sh
index 0785a9bcf..2057b7306 100755
--- a/bin/jenkins/finalize.sh
+++ b/bin/jenkins/finalize.sh
@@ -59,6 +59,25 @@ if [[ $(find $LOGS_DIR -path "*minidumps*" -name "*dmp") ]]; then
   rm -rf $SYM_DIR
 fi
 
+# Do a second pass over the minidumps with the resolve_minidumps.py script.
+# This means that we now generate two JUnitXMLs per minidump. This should
+# be temporary.
+# TODO: Once we verify everything works, we can remove the first loop.
+if [[ $(find $LOGS_DIR -path "*minidumps*" -name "*dmp") ]]; then
+  for minidump in $(find $LOGS_DIR -path "*minidumps*" -name "*dmp"); do
+    # Since this is experimental, use it inside an if so that any error code doesn't
+    # abort this script.
+    if ! bin/resolve_minidumps.py --minidump_file ${minidump} \
+        --output_file ${minidump}_dumpedv2 ; then
+      echo "bin/resolve_minidumps.py failed!"
+    else
+      "${IMPALA_HOME}"/bin/generate_junitxml.py --phase finalize --step minidumpsv2 \
+          --error "Minidump generated: $minidump" \
+          --stderr "resolve_minidumps.py output:\n$(head -n 100 ${minidump}_dumpedv2)"
+    fi
+  done
+fi
+
 function check_for_asan_error {
   ERROR_LOG=${1}
   if grep -q "ERROR: AddressSanitizer:" ${ERROR_LOG} ; then
diff --git a/bin/resolve_minidumps.py b/bin/resolve_minidumps.py
new file mode 100755
index 000000000..0a0ba0c61
--- /dev/null
+++ b/bin/resolve_minidumps.py
@@ -0,0 +1,376 @@
+#!/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This script automates symbol resolution for Breakpad minidumps
+# under ideal circumstances. Specifically, it expects all the
+# binaries to be in the same locations as when the minidump
+# was taken. This is often true for minidumps on a developer
+# workstation or at the end of an Impala test job. It finds Breakpad
+# using environment variables from the Impala dev environment,
+# so it must run inside the Impala dev environment.
+# TODO: It may be possible to extend this to Docker images.
+#
+# Within this simple context, this script aims for complete
+# symbol resolution. It uses Breakpad's minidump_dump utility
+# to dump the minidump, then it parses the list of libraries
+# that were used by the binary. It gets the symbols for all
+# those libraries and resolves the minidump.
+#
+# Usage: resolve_minidumps.py --minidump_file [file] --output_file [file]
+# (optional -v or --verbose for more output)
+
+import errno
+import logging
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+
+from argparse import ArgumentParser
+
+
+class ModuleInfo:
+  def __init__(self, code_file, code_id, debug_file, debug_id):
+    self.code_file = code_file
+    self.code_id = code_id
+    self.debug_file = debug_file
+    self.debug_id = debug_id
+
+
+def read_module_info(minidump_dump_contents):
+  """Read the module information out of the minidump_dump raw contents.
+  This is expecting 'minidump_dump_contents' to be the minidump_dump
+  contents for the minidump split by newlines.
+  This will return a list of ModuleInfo objects.
+  """
+  # Find the module_count
+  for idx, line in enumerate(minidump_dump_contents):
+    if line.strip().startswith("module_count"):
+      module_count = int(line.split("=")[1].strip())
+      break
+
+  # The minidump has a MDRawModule per module and it will have
+  # the same number of MDRawModule dumps as module_count.
+  module_boundaries = []
+  for idx, line in enumerate(minidump_dump_contents):
+    if line.startswith("MDRawModule"):
+      module_boundaries.append(idx)
+
+  if len(module_boundaries) != module_count:
+    logging.error("Failed to parse modules, mismatch in module count "
+                  "({0} != {1})".format(len(module_boundaries), module_count))
+    return None
+
+  # Add one more entry to module_boundaries that is the end of the file.
+  # That makes this more of a list of boundaries than the list of
+  # start locations.
+  module_boundaries.append(len(minidump_dump_contents))
+
+  modules = []
+  for module_idx in range(module_count):
+    module_start = module_boundaries[module_idx]
+    module_end = module_boundaries[module_idx + 1]
+
+    # Find the code_file
+    code_file = None
+    code_identifier = None
+    debug_file = None
+    debug_identifier = None
+    for line in minidump_dump_contents[module_start:module_end]:
+      if line.find("code_file") != -1:
+        code_file = line.split("=")[1].strip().strip('"')
+      elif line.find("code_identifier") != -1:
+        code_identifier = line.split("=")[1].strip().strip('"')
+      elif line.find("debug_file") != -1:
+        debug_file = line.split("=")[1].strip().strip('"')
+      elif line.find("debug_identifier") != -1:
+        debug_identifier = line.split("=")[1].strip().strip('"')
+
+    # Important: it is ok for the fields to be the zero-length string.
+    # We just care that they are non-None (i.e. the loop above encountered
+    # them and parsed a value).
+    if code_file is None or code_identifier is None or debug_file is None or \
+       debug_identifier is None:
+      logging.error("Failed to parse dump output, missing fields for MDRawModule "
+                    "{0}".format(module_idx))
+      return None
+
+    # Jars and other files show up in this list, but they have
+    # code identifiers or debug identifiers as all zeros. Skip those,
+    # as there are no symbols to find.
+    if re.fullmatch("[0]+", code_identifier) or re.fullmatch("[0]+", debug_identifier):
+      continue
+
+    # Skip cases where the code identifier or debug identifier are null
+    if len(code_identifier) == 0 or len(debug_identifier) == 0:
+      continue
+
+    # linux-gate.so is a special case, and it is not an actual file on disk.
+    if code_file.startswith("linux-gate.so"):
+      continue
+
+    modules.append(ModuleInfo(code_file, code_identifier, debug_file, debug_identifier))
+
+  return modules
+
+
+def find_breakpad_home():
+  """Locate the Breakpad home directory.
+
+  We try to locate the package in the Impala toolchain folder.
+  """
+  toolchain_packages_home = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
+  if not toolchain_packages_home:
+    logging.error("IMPALA_TOOLCHAIN_PACKAGES_HOME is not set")
+    return None
+
+  if not os.path.isdir(toolchain_packages_home):
+    logging.error("Could not find toolchain packages directory")
+    return None
+  breakpad_version = os.environ.get('IMPALA_BREAKPAD_VERSION')
+  if not breakpad_version:
+    logging.error("Could not determine breakpad version from toolchain")
+    return None
+  breakpad_dir = '{0}/breakpad-{1}'.format(toolchain_packages_home, breakpad_version)
+  if not os.path.isdir(breakpad_dir):
+    logging.error("Could not find breakpad directory")
+    return None
+
+  return breakpad_dir
+
+
+def find_breakpad_binary(binary_name):
+  """Locate the specified Breakpad binary"""
+  breakpad_home = find_breakpad_home()
+  if not breakpad_home:
+    return None
+
+  binary_path = os.path.join(breakpad_home, 'bin', binary_name)
+  if not os.path.isfile(binary_path):
+    logging.error("Could not find {0} executable at {1}".format(binary_name, binary_path))
+    return None
+
+  return binary_path
+
+
+def find_objcopy_binary():
+  """Locate the 'objcopy' binary from Binutils.
+
+  We try to locate the package in the Impala toolchain folder.
+  TODO: Fall back to finding objcopy in the system path.
+  """
+  toolchain_packages_home = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
+  if not toolchain_packages_home:
+    logging.error("IMPALA_TOOLCHAIN_PACKAGES_HOME is not set")
+    return None
+
+  if not os.path.isdir(toolchain_packages_home):
+    logging.error("Could not find toolchain packages directory")
+    return None
+  binutils_version = os.environ.get('IMPALA_BINUTILS_VERSION')
+  if not binutils_version:
+    logging.error("Could not determine binutils version from toolchain")
+    return None
+  binutils_dir = "binutils-{0}".format(binutils_version)
+  objcopy = os.path.join(toolchain_packages_home, binutils_dir, 'bin', 'objcopy')
+  if not os.path.isfile(objcopy):
+    logging.error("Could not find objcopy executable at {0}".format(objcopy))
+    return None
+  return objcopy
+
+
+def ensure_dir_exists(path):
+  """Make sure the directory 'path' exists in a thread-safe way."""
+  try:
+    os.makedirs(path)
+  except OSError as e:
+    if e.errno != errno.EEXIST or not os.path.isdir(path):
+      raise e
+
+
+def dump_symbols_for_binary(dump_syms, objcopy, binary, out_dir):
+  """Dump symbols of a single binary file and move the result.
+
+  Symbols will be extracted to a temporary file and moved into place afterwards. Required
+  directories will be created if necessary.
+  """
+  logging.info("Processing binary file: {0}".format(binary))
+  ensure_dir_exists(out_dir)
+  # tmp_fd will be closed when the file object created by os.fdopen() below gets
+  # destroyed.
+  tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym')
+  try:
+    # Create a temporary directory used for decompressing debug info
+    tempdir = tempfile.mkdtemp()
+
+    # Binaries can contain compressed debug symbols. Breakpad currently
+    # does not support dumping symbols for binaries with compressed debug
+    # symbols.
+    #
+    # As a workaround, this uses objcopy to create a copy of the binary with
+    # the debug symbols decompressed. If the debug symbols are not compressed
+    # in the original binary, objcopy simply makes a copy of the binary.
+    # Breakpad is able to read symbols from the decompressed binary, and
+    # those symbols work correctly in resolving a minidump from the original
+    # compressed binary.
+    # TODO: In theory, this could work with the binary.debug_path.
+    binary_basename = os.path.basename(binary)
+    decompressed_binary = os.path.join(tempdir, binary_basename)
+    objcopy_retcode = subprocess.call([objcopy, "--decompress-debug-sections",
+                                       binary, decompressed_binary])
+
+    # Run dump_syms on the binary
+    # If objcopy failed for some reason, fall back to running dump_syms
+    # directly on the original binary. This is unlikely to work, but it is a way of
+    # guaranteeing that objcopy is not the problem.
+    args = [dump_syms, decompressed_binary]
+    if objcopy_retcode != 0:
+      sys.stderr.write('objcopy failed. Trying to run dump_sym directly.\n')
+      args = [dump_syms, binary]
+
+    # Run dump_syms on the binary.
+    proc = subprocess.Popen(args, stdout=os.fdopen(tmp_fd, 'wb'), stderr=subprocess.PIPE)
+    _, stderr = proc.communicate()
+    if proc.returncode != 0:
+      sys.stderr.write('Failed to dump symbols from %s, return code %s\n' %
+          (binary, proc.returncode))
+      sys.stderr.write(stderr.decode('utf-8'))
+      os.remove(tmp_file)
+      return False
+    # Parse the temporary file to determine the full target path.
+    with open(tmp_file, 'r') as f:
+      header = f.readline().strip()
+      # Format of header is: MODULE os arch binary_id binary
+      _, _, _, binary_id, binary = header.split(' ')
+      out_path = os.path.join(out_dir, binary, binary_id)
+      ensure_dir_exists(out_path)
+    # Move the temporary file to its final destination.
+    shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % binary))
+  except Exception as e:
+    # Only need to clean up in case of errors.
+    try:
+      os.remove(tmp_file)
+    except EnvironmentError:
+      pass
+    raise e
+  finally:
+    # Cleanup temporary directory
+    shutil.rmtree(tempdir)
+  return True
+
+
+def dump_symbols_for_all_modules(dump_syms, objcopy, module_list, out_dir):
+  """Given a list of modules (ModuleInfo objects), dump symbols for
+  each library listed.
+  """
+  for module in module_list:
+    success = dump_symbols_for_binary(dump_syms, objcopy, module.code_file, out_dir)
+    if not success:
+      logging.warning("Failed to dump symbols for {0}".format(module.code_file))
+
+
+def resolve_minidump(minidump_stackwalk, minidump_path, symbol_dir, verbose, out_file):
+  with open(out_file, "w") as out_f:
+    stderr_output = None if verbose else subprocess.DEVNULL
+    subprocess.run([minidump_stackwalk, minidump_path, symbol_dir], stdout=out_f,
+                   stderr=stderr_output, check=True)
+
+
+def raw_dump_for_minidump(minidump_dump, minidump_path):
+  """Run minidump_dump on the specified minidump and split the output into lines"""
+  # minidump_dump sometimes returns an error code even though it produced usable output.
+  # So, this doesn't check the error code, and it relies on read_module_info() doing
+  # validation.
+  #
+  # Python 3.6 adjustments:
+  # 'capture_output=True' not supported: set stdout/stderr to subprocess.PIPE instead
+  # 'text=True' not supported: set 'universal_newlines=True' (the two are the same thing)
+  output = subprocess.run([minidump_dump, minidump_path], stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE, universal_newlines=True)
+  return output.stdout.split('\n')
+
+
+def parse_args():
+  """Parse command line arguments and perform sanity checks."""
+  # TODO:
+  #  - Add ability to specify Breakpad home
+  #  - Add ability to specify the symbol directory location (for reuse)
+  #  - Add ability to specify Binutils home
+  parser = ArgumentParser()
+  parser.add_argument('--minidump_file', required=True)
+  parser.add_argument('--output_file', required=True)
+  parser.add_argument('-v', '--verbose', action='store_true')
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  args = parse_args()
+
+  if args.verbose:
+    logging.basicConfig(level=logging.INFO)
+  else:
+    logging.basicConfig(level=logging.WARNING)
+
+  # Step 1: Get the raw dump for the specified minidump
+  minidump_dump_bin = find_breakpad_binary("minidump_dump")
+  if not minidump_dump_bin:
+    logging.error("Could not find Breakpad minidump_dump binary")
+    sys.exit(1)
+  contents = raw_dump_for_minidump(minidump_dump_bin, args.minidump_file)
+  if not contents:
+    logging.error(
+      "minidump_dump could not get the contents of {0}".format(args.minidump_file))
+    sys.exit(1)
+
+  # Step 2: Parse the raw dump to get the list of code modules
+  # This is the list of things that have symbols we need to dump.
+  modules = read_module_info(contents)
+  if not modules:
+    logging.error("Failed to read modules for {0}".format(args.minidump_file))
+    sys.exit(1)
+
+  # Create a temporary directory to store the symbols.
+  # This automatically gets cleaned up.
+  with tempfile.TemporaryDirectory() as tmp_dir:
+    # Step 3: Dump symbols for all the modules into this temporary directory.
+    # Need both dump_syms and objcopy
+    dump_syms_bin = find_breakpad_binary("dump_syms")
+    if not dump_syms_bin:
+      logging.error("Could not find Breakpad dump_syms binary")
+      sys.exit(1)
+    objcopy_bin = find_objcopy_binary()
+    if not objcopy_bin:
+      logging.error("Could not find Binutils objcopy binary")
+      sys.exit(1)
+    dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)
+
+    # Step 4: Resolve the minidump with the temporary symbol directory
+    minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
+    if not minidump_stackwalk_bin:
+      logging.error("Could not find Breakpad minidump_stackwalk binary")
+      sys.exit(1)
+    resolve_minidump(find_breakpad_binary("minidump_stackwalk"), args.minidump_file,
+                     tmp_dir, args.verbose, args.output_file)
+
+
+if __name__ == "__main__":
+  main()