You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by lv...@apache.org on 2019/02/21 01:08:52 UTC

[impala] 01/06: IMPALA-8191: Wait for additional breakpad processes during test

This is an automated email from the ASF dual-hosted git repository.

lv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c1274fafb04de1b9b7c3a17e209814b8c4346311
Author: Lars Volker <lv...@cloudera.com>
AuthorDate: Fri Feb 15 05:49:56 2019 -0800

    IMPALA-8191: Wait for additional breakpad processes during test
    
    The Breakpad signal handler forks off a process to write a minidump.
    During the breakpad tests we send signals to the Impala daemons and then
    wait for all processes to go away. Prior to this change we did this by
    waiting on the PID returned by process.get_pid(). It is determined by
    iterating over psutil.get_pid_list() which is an ordered list of PIDs
    running on the system. We return the first process in the list with a
    matching command line. In cases where the PID space rolled over, this
    could have been the forked off breakpad process and we'd wait on that
    one. During the subsequent check that all processes are indeed gone, we
    could then pick up the original Impala daemon that had forked off to
    write the minidump and was still in the process of shutting down.
    
    To fix this, we wait for every process twice. Processes are identified
    by their command and iterating through them twice makes sure we catch
    both the original daemon and it's breakpad child.
    
    This change also contains improvements to the logging of processes in
    our tests. This should make it easier to identify similar issues in the
    future.
    
    Testing: I ran the breakpad tests in exhaustive mode. I didn't try to
    exercise it around a PID roll-over, but we shouldn't see the issue in
    IMPALA-8191 again.
    
    Change-Id: Ia4dcc5fecb9b5f38ae1504aae40f099837cf1bca
    Reviewed-on: http://gerrit.cloudera.org:8080/12501
    Reviewed-by: Lars Volker <lv...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/common/impala_cluster.py        | 63 +++++++++++++++++++++++++----------
 tests/custom_cluster/test_breakpad.py | 14 +++++---
 2 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/tests/common/impala_cluster.py b/tests/common/impala_cluster.py
index 751c792..1596741 100644
--- a/tests/common/impala_cluster.py
+++ b/tests/common/impala_cluster.py
@@ -287,39 +287,71 @@ class Process(object):
     self.container_id = container_id
     self.port_map = port_map
 
+  def __class_name(self):
+    return self.__class__.__name__
+
+  def __str__(self):
+    return "<%s PID: %s (%s)>" % (self.__class_name(), self.__get_pid(),
+                                  ' '.join(self.cmd))
+
+  def __repr__(self):
+    return str(self)
+
   def get_pid(self):
     """Gets the PID of the process. Returns None if the PID cannot be determined"""
-    LOG.info("Attempting to find PID for %s" % ' '.join(self.cmd))
-    return self.__get_pid()
+    pid = self.__get_pid()
+    if pid:
+      LOG.info("Found PID %s for %s" % (pid, " ".join(self.cmd)))
+    else:
+      LOG.info("No PID found for process cmdline: %s. Process is dead?" %
+               " ".join(self.cmd))
+    return pid
+
+  def get_pids(self):
+    """Gets the PIDs of the process. In some circumstances, a process can run multiple
+    times, e.g. when it forks in the Breakpad crash handler. Returns an empty list if no
+    PIDs can be determined."""
+    pids = self.__get_pids()
+    if pids:
+      LOG.info("Found PIDs %s for %s" % (", ".join(map(str, pids)), " ".join(self.cmd)))
+    else:
+      LOG.info("No PID found for process cmdline: %s. Process is dead?" %
+               " ".join(self.cmd))
+    return pids
 
   def __get_pid(self):
+    pids = self.__get_pids()
+    assert len(pids) < 2, "Expected single pid but found %s" % ", ".join(map(str, pids))
+    return len(pids) == 1 and pids[0] or None
+
+  def __get_pids(self):
     if self.container_id is not None:
       container_info = self._get_container_info(self.container_id)
       if container_info["State"]["Status"] != "running":
-        return None
-      return container_info["State"]["Status"]["Pid"]
+        return []
+      return [container_info["State"]["Status"]["Pid"]]
 
     # In non-containerised case, search for process based on matching command lines.
+    pids = []
     for pid in psutil.get_pid_list():
       try:
         process = psutil.Process(pid)
         if set(self.cmd) == set(process.cmdline):
-          return pid
-      except psutil.NoSuchProcess, e:
-        # A process from get_pid_list() no longer exists, continue.
-        LOG.info(e)
-    LOG.info("No PID found for process cmdline: %s. Process is dead?" % self.cmd)
-    return None
+          pids.append(pid)
+      except psutil.NoSuchProcess:
+        # A process from get_pid_list() no longer exists, continue. We don't log this
+        # error since it can refer to arbitrary processes outside of our testing code.
+        pass
+    return pids
 
   def kill(self, signal=SIGKILL):
     """
     Kills the given processes.
     """
     if self.container_id is None:
-      pid = self.get_pid()
-      if pid is None:
-        assert 0, "No processes %s found" % self.cmd
-      LOG.info('Killing: %s (PID: %d) with signal %s' % (' '.join(self.cmd), pid, signal))
+      pid = self.__get_pid()
+      assert pid is not None, "No processes for %s" % self
+      LOG.info('Killing %s with signal %s' % (self, signal))
       exec_process("kill -%d %d" % (signal, pid))
     else:
       LOG.info("Stopping container: {0}".format(self.container_id))
@@ -349,9 +381,6 @@ class Process(object):
     while self.__get_pid() is not None:
       sleep(0.01)
 
-  def __str__(self):
-    return "Command: %s PID: %s" % (self.cmd, self.get_pid())
-
 
 # Base class for all Impala processes
 class BaseImpalaProcess(Process):
diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py
index 91f79d9..1c2d6bf 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -88,11 +88,15 @@ class TestBreakpadBase(CustomClusterTestSuite):
   def wait_for_all_processes_dead(self, processes, timeout=300):
     for process in processes:
       try:
-        pid = process.get_pid()
-        if not pid:
-          continue
-        psutil_process = psutil.Process(pid)
-        psutil_process.wait(timeout)
+        # For every process in the list we might see the original Impala process plus a
+        # forked off child that is writing the minidump. We need to catch both.
+        for pid in process.get_pids():
+          print "Checking pid %s" % pid
+          psutil_process = psutil.Process(pid)
+          psutil_process.wait(timeout)
+      except psutil.NoSuchProcess:
+        # Process has exited in the meantime
+        pass
       except psutil.TimeoutExpired:
         raise RuntimeError("Unable to kill %s (pid %d) after %d seconds." %
             (psutil_process.name, psutil_process.pid, timeout))