You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by mi...@apache.org on 2024/01/08 19:06:35 UTC

(impala) branch master updated (c3f875eac -> 3bcd770df)

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from c3f875eac IMPALA-12683: Fix wrong event time for batched events
     new c0a015fda IMPALA-12643 (part 1): Limit memory consumption for resolve_minidumps.py
     new dac7f409b IMPALA-12643 (part 2): Fallback to safe libraries on error in resolve_minidumps.py
     new fdd928563 IMPALA-11909: Use absolute path when calling resolve_minidumps.py
     new 3bcd770df IMPALA-10048: Go parallel for dump_breakpad_symbols.py

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 bin/dump_breakpad_symbols.py |  20 ++++++--
 bin/jenkins/finalize.sh      |   2 +-
 bin/resolve_minidumps.py     | 113 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 107 insertions(+), 28 deletions(-)

(impala) 02/04: IMPALA-12643 (part 2): Fallback to safe libraries on error in resolve_minidumps.py

Posted by mi...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit dac7f409ba0619180835ea908349ac5108a58c3a
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Sun Dec 17 13:59:35 2023 -0800

    IMPALA-12643 (part 2): Fallback to safe libraries on error in resolve_minidumps.py
    
    Since resolve_minidumps.py's call to minidump_stackwalk can go haywire
    due to bad symbols in shared libraries, this adds a fallback mechanism
    where it tries again with a "safe" list of shared libraries. These are
    limited to the ones that make the most difference in resolving minidumps
    (libc, libstdc++, and libjvm). The list of safe libraries can be
    customized via the --safe_library_list.
    
    Testing:
     - Verified that this uses the fallback on Centos 7 and resolves
       the minidumps successfully.
    
    Change-Id: I6bb4c9f65f9c27bb3b86c7ff2f3a6a48e258ef01
    Reviewed-on: http://gerrit.cloudera.org:8080/20863
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/resolve_minidumps.py | 96 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 74 insertions(+), 22 deletions(-)

diff --git a/bin/resolve_minidumps.py b/bin/resolve_minidumps.py
index f410fabcf..3c64d9d55 100755
--- a/bin/resolve_minidumps.py
+++ b/bin/resolve_minidumps.py
@@ -43,6 +43,7 @@ import shutil
 import subprocess
 import sys
 import tempfile
+import traceback
 
 from argparse import ArgumentParser
 
@@ -132,6 +133,26 @@ def read_module_info(minidump_dump_contents):
   return modules
 
 
+def filter_shared_library_modules(module_list, lib_allow_list):
+  """Filter the list of modules by eliminating any shared libaries that do not match
+  one of the prefixes in the allow list. This keeps all non-shared libaries
+  (such as the main binary).
+  """
+  filtered_module_list = []
+  for module in module_list:
+    code_file_basename = os.path.basename(module.code_file)
+    # Keep anything that is not a shared library (e.g. the main binary)
+    if ".so" not in code_file_basename:
+      filtered_module_list.append(module)
+      continue
+    # Only keep shared libraries that match an entry on the allow list.
+    for allow_lib in lib_allow_list:
+      if code_file_basename.startswith(allow_lib):
+        filtered_module_list.append(module)
+        break
+  return filtered_module_list
+
+
 def find_breakpad_home():
   """Locate the Breakpad home directory.
 
@@ -331,10 +352,39 @@ def parse_args():
   parser.add_argument('--minidump_file', required=True)
   parser.add_argument('--output_file', required=True)
   parser.add_argument('-v', '--verbose', action='store_true')
+  parser.add_argument('--safe_library_list',
+      default="libstdc++.so,libc.so,libjvm.so",
+      help="Comma-separate list of prefixes for allowed system libraries")
   args = parser.parse_args()
   return args
 
 
+def dump_syms_and_resolve_stack(modules, minidump_file, output_file, verbose):
+  """Dump the symbols for the listed modules and use them to resolve the minidump."""
+  # Create a temporary directory to store the symbols
+  # This automatically gets cleaned up
+  with tempfile.TemporaryDirectory() as tmp_dir:
+    # Dump symbols for all the modules into this temporary directory.
+    # Need both dump_syms and objcopy
+    dump_syms_bin = find_breakpad_binary("dump_syms")
+    if not dump_syms_bin:
+      logging.error("Could not find Breakpad dump_syms binary")
+      sys.exit(1)
+    objcopy_bin = find_objcopy_binary()
+    if not objcopy_bin:
+      logging.error("Could not find Binutils objcopy binary")
+      sys.exit(1)
+    dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)
+
+    # Resolve the minidump with the temporary symbol directory
+    minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
+    if not minidump_stackwalk_bin:
+      logging.error("Could not find Breakpad minidump_stackwalk binary")
+      sys.exit(1)
+    resolve_minidump(find_breakpad_binary("minidump_stackwalk"), minidump_file,
+                     tmp_dir, verbose, output_file)
+
+
 def main():
   args = parse_args()
 
@@ -361,28 +411,30 @@ def main():
     logging.error("Failed to read modules for {0}".format(args.minidump_file))
     sys.exit(1)
 
-  # Create a temporary directory to store the symbols.
-  # This automatically gets cleaned up.
-  with tempfile.TemporaryDirectory() as tmp_dir:
-    # Step 3: Dump symbols for all the modules into this temporary directory.
-    # Need both dump_syms and objcopy
-    dump_syms_bin = find_breakpad_binary("dump_syms")
-    if not dump_syms_bin:
-      logging.error("Could not find Breakpad dump_syms binary")
-      sys.exit(1)
-    objcopy_bin = find_objcopy_binary()
-    if not objcopy_bin:
-      logging.error("Could not find Binutils objcopy binary")
-      sys.exit(1)
-    dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)
-
-    # Step 4: Resolve the minidump with the temporary symbol directory
-    minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
-    if not minidump_stackwalk_bin:
-      logging.error("Could not find Breakpad minidump_stackwalk binary")
-      sys.exit(1)
-    resolve_minidump(find_breakpad_binary("minidump_stackwalk"), args.minidump_file,
-                     tmp_dir, args.verbose, args.output_file)
+  # Step 3: Dump the symbols and use them to resolve the minidump
+  # Sometimes there are libraries with corrupt/problematic symbols
+  # that can cause minidump_stackwalk to go haywire and use excessive
+  # memory. First, we try using symbols from all of the shared libraries.
+  # If that fails, we fallback to using a "safe" list of shared libraries.
+  try:
+    # Dump the symbols and use them to resolve the minidump
+    dump_syms_and_resolve_stack(modules, args.minidump_file, args.output_file,
+                                args.verbose)
+    return
+  except Exception:
+    logging.warning("Encountered error: {0}".format(traceback.format_exc()))
+    logging.warning("Falling back to resolution using the safe library list")
+    logging.warning("Safe library list: {0}".format(args.safe_library_list))
+
+  # Limit the shared libraries to the "safe" list of shared libraries and
+  # try again.
+  if len(args.safe_library_list) == 0:
+    safe_library_list = []
+  else:
+    safe_library_list = args.safe_library_list.split(",")
+  safe_modules = filter_shared_library_modules(modules, safe_library_list)
+  dump_syms_and_resolve_stack(safe_modules, args.minidump_file, args.output_file,
+                              args.verbose)
 
 
 if __name__ == "__main__":

(impala) 03/04: IMPALA-11909: Use absolute path when calling resolve_minidumps.py

Posted by mi...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit fdd928563f2f5be05828bfeed24516449bb8f30b
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Wed Feb 15 16:36:56 2023 -0800

    IMPALA-11909: Use absolute path when calling resolve_minidumps.py
    
    If the bin/jenkins/finalize.sh script is called from a directory
    other than $IMPALA_HOME, it's call to resolve_minidumps.py will
    fail due to the relative path. This changes the call to use
    the absolute path so that finalize.sh works in this case.
    
    Testing:
     - Ran bin/jenkins/finalize.sh from a directory other than
       $IMPALA_HOME
    
    Change-Id: I063843554b52d3e8ed79ee32d9fd4c90d059c482
    Reviewed-on: http://gerrit.cloudera.org:8080/20801
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/jenkins/finalize.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/jenkins/finalize.sh b/bin/jenkins/finalize.sh
index 2057b7306..2c8db754d 100755
--- a/bin/jenkins/finalize.sh
+++ b/bin/jenkins/finalize.sh
@@ -67,7 +67,7 @@ if [[ $(find $LOGS_DIR -path "*minidumps*" -name "*dmp") ]]; then
   for minidump in $(find $LOGS_DIR -path "*minidumps*" -name "*dmp"); do
     # Since this is experimental, use it inside an if so that any error code doesn't
     # abort this script.
-    if ! bin/resolve_minidumps.py --minidump_file ${minidump} \
+    if ! "${IMPALA_HOME}"/bin/resolve_minidumps.py --minidump_file ${minidump} \
         --output_file ${minidump}_dumpedv2 ; then
       echo "bin/resolve_minidumps.py failed!"
     else

(impala) 01/04: IMPALA-12643 (part 1): Limit memory consumption for resolve_minidumps.py

Posted by mi...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c0a015fdac677c51b6d0e0f8511cc29eb2f8f304
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Fri Dec 15 16:55:32 2023 -0800

    IMPALA-12643 (part 1): Limit memory consumption for resolve_minidumps.py
    
    On some platforms (Centos 7), resolve_minidumps.py's call to
    minidump_stackwalk goes haywire and uses all the system memory
    until it gets OOM killed. Some library must have corrupt
    symbols, etc. As a workaround, this detects whether the
    prlimit utility is present and uses this to run minidump_stackwalk
    with a 4GB limit on virtual memory. This kills the process
    earlier and avoids using all system memory.
    
    Testing:
     - Verified that bin/jenkins/finalize.sh uses resolve_minidumps.py
       on a Redhat 8 Jenkins job (and it works)
     - Verified that bin/jenkins/finalize.sh works properly on
       my Ubuntu 20 box
     - Ran a Jenkins job on Centos 7 and verified that the prlimit
       code kills minidump_stackwalk when it uses 4GB of memory.
    
    Change-Id: I4db8facb8a037327228c3714e047e0d1f0fe1d94
    Reviewed-on: http://gerrit.cloudera.org:8080/20862
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/resolve_minidumps.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/bin/resolve_minidumps.py b/bin/resolve_minidumps.py
index e3f00be70..f410fabcf 100755
--- a/bin/resolve_minidumps.py
+++ b/bin/resolve_minidumps.py
@@ -32,7 +32,7 @@
 # that were used by the binary. It gets the symbols for all
 # those libraries and resolves the minidump.
 #
-# Usage: resolve_minidump.py --minidump_file [file] --output_file [file]
+# Usage: resolve_minidumps.py --minidump_file [file] --output_file [file]
 # (optional -v or --verbose for more output)
 
 import errno
@@ -288,9 +288,22 @@ def dump_symbols_for_all_modules(dump_syms, objcopy, module_list, out_dir):
 
 
 def resolve_minidump(minidump_stackwalk, minidump_path, symbol_dir, verbose, out_file):
+  minidump_stackwalk_cmd = [minidump_stackwalk, minidump_path, symbol_dir]
+  # There are circumstances where the minidump_stackwalk can go wrong and become
+  # a runaway process capable of using all system memory. If the prlimit utility
+  # is present, we use it to apply a limit on the memory consumption.
+  #
+  # See if we have the prlimit utility
+  check_prlimit = subprocess.run(["prlimit", "-V"], stdout=subprocess.DEVNULL,
+      stderr=subprocess.DEVNULL)
+  if check_prlimit.returncode == 0:
+    # The prlimit utility is available, so wrap the minidump_stackwalk command
+    # to apply a 4GB limit on virtual memory. In normal operations, 4G is plenty.
+    prlimit_wrapper = ["prlimit", "--as={0}".format(4 * 1024 * 1024 * 1024)]
+    minidump_stackwalk_cmd = prlimit_wrapper + minidump_stackwalk_cmd
   with open(out_file, "w") as out_f:
     stderr_output = None if verbose else subprocess.DEVNULL
-    subprocess.run([minidump_stackwalk, minidump_path, symbol_dir], stdout=out_f,
+    subprocess.run(minidump_stackwalk_cmd, stdout=out_f,
                    stderr=stderr_output, check=True)

(impala) 04/04: IMPALA-10048: Go parallel for dump_breakpad_symbols.py

Posted by mi...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3bcd770dfc51ed5148cf85a96f5644594f953319
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Sun Jun 25 21:20:25 2023 -0700

    IMPALA-10048: Go parallel for dump_breakpad_symbols.py
    
    This modifies dump_breakpad_symbols.py to use a ThreadPool
    to go parallel when there are multiple binaries or
    libraries to process. This is common for Jenkins jobs that
    dump symbols for all backend tests. The different binaries
    write out to different directories, so the threads don't
    interfere with each other.
    
    Testing:
     - Ran locally dumping the symbols for all backend tests
     - Ran a Jenkins job that generates a minidump and triggers
       the minidump symbol processing. It went parallel and
       worked fine.
    
    Change-Id: I93427bb07f1d9718bd6df90acfd247210b54294d
    Reviewed-on: http://gerrit.cloudera.org:8080/20802
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Michael Smith <mi...@cloudera.com>
---
 bin/dump_breakpad_symbols.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/bin/dump_breakpad_symbols.py b/bin/dump_breakpad_symbols.py
index 81bf00d54..da8485bd4 100755
--- a/bin/dump_breakpad_symbols.py
+++ b/bin/dump_breakpad_symbols.py
@@ -56,8 +56,8 @@
 from __future__ import absolute_import, division, print_function
 import errno
 import logging
-import glob
 import magic
+import multiprocessing
 import os
 import shutil
 import subprocess
@@ -66,6 +66,7 @@ import tempfile
 
 from argparse import ArgumentParser
 from collections import namedtuple
+from multiprocessing.pool import ThreadPool
 
 BinarySymbolInfo = namedtuple('BinarySymbolInfo', 'path, debug_path')
 
@@ -137,6 +138,8 @@ def parse_args():
   parser.add_argument('-s', '--symbol_pkg', '--debuginfo_rpm', help="""RPM/DEB file
       containing the debug symbols matching the binaries in -r""")
   parser.add_argument('--objcopy', help='Path to the objcopy binary from Binutils')
+  parser.add_argument('--num_processes', type=int, default=multiprocessing.cpu_count(),
+      help="Number of parallel processes to use.")
   args = parser.parse_args()
 
   # Post processing checks
@@ -341,9 +344,20 @@ def main():
   assert objcopy
   status = 0
   ensure_dir_exists(args.dest_dir)
-  for binary in enumerate_binaries(args):
-    if not process_binary(dump_syms, objcopy, binary, args.dest_dir):
+  # Use a thread pool to go parallel
+  thread_pool = ThreadPool(processes=args.num_processes)
+
+  def processing_fn(binary):
+    return process_binary(dump_syms, objcopy, binary, args.dest_dir)
+
+  for result in thread_pool.imap_unordered(processing_fn, enumerate_binaries(args)):
+    if not result:
+      thread_pool.terminate()
       status = 1
+      break
+
+  thread_pool.close()
+  thread_pool.join()
   sys.exit(status)