You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/04/09 05:28:03 UTC

[1/2] incubator-impala git commit: IMPALA-3794: Workaround for Breakpad ID conflicts

Repository: incubator-impala
Updated Branches:
  refs/heads/master 4a79c9e7e -> 8146532b6


IMPALA-3794: Workaround for Breakpad ID conflicts

Breakpad determines the ID of the minidump file to be written in case of
a crash during startup of the process randomly, seeded with the current
system time with second granularity. If two impalads start up within the
same second, there is a chance for a name conflict. The one second delay
between starting impalads in start-impala-cluster.py is not sufficient:

I0407 22:34:52.018563 28473 minidump.cc:245] Setting minidump size limit
to 20971520.
I0407 22:34:52.997046 28749 minidump.cc:245] Setting minidump size limit
to 20971520.

When sending a signal to all of them, one process can overwrite the
minidump of another one. This is an upstream issue and is tracked in
Breakpad-681. I further confirmed my suspicion by tentatively making an
own output folder for each running instance of impalad and was then
unable to reproduce the issue. However, it is a more clear solution to
fix the underlying issue than to change the folder locations for
minidumps in impala.

Until this is fixed upstream, we can make sure that we see at least one
minidump for the group of impalads in the test cluster. It is not a
product defect, since we don't support running multiple impalads on a
single host, let alone starting them all at once.

To test this I ran the following loop for about an hour on my dev
machine without hitting the issue:

while [ $? -eq 0 ]; do impala-py.test
tests/custom_cluster/test_breakpad.py --exploration_strategy=exhaustive
-k test_minidump_relative_path -x -s; done

Change-Id: I4ae589f6eb5cbbfb860943214edc0e6415eeb862
Reviewed-on: http://gerrit.cloudera.org:8080/6588
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a827e9ed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a827e9ed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a827e9ed

Branch: refs/heads/master
Commit: a827e9edc1f93ec0ddb2184133b114182ebb2e02
Parents: 4a79c9e
Author: Lars Volker <lv...@cloudera.com>
Authored: Fri Apr 7 13:12:34 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Apr 8 19:58:25 2017 +0000

----------------------------------------------------------------------
 tests/custom_cluster/test_breakpad.py | 37 ++++++++++++++++++------------
 1 file changed, 22 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a827e9ed/tests/custom_cluster/test_breakpad.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py
index 834b969..bf480e9 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -37,6 +37,9 @@ class TestBreakpad(CustomClusterTestSuite):
   writing minidump files on unhandled signals and rotating old minidumps on startup. The
   tests kill the daemons by sending a SIGSEGV signal.
   """
+  # Limit for the number of minidumps that gets passed to the daemons as a startup flag.
+  MAX_MINIDUMPS = 2
+
   @classmethod
   def get_workload(cls):
     return 'functional-query'
@@ -77,10 +80,11 @@ class TestBreakpad(CustomClusterTestSuite):
     self._start_impala_cluster(cluster_options)
 
   def start_cluster(self):
-    self.start_cluster_with_args(minidump_path=self.tmp_dir, max_minidumps=2)
+    self.start_cluster_with_args(minidump_path=self.tmp_dir,
+                                 max_minidumps=self.MAX_MINIDUMPS)
 
   def start_cluster_without_minidumps(self):
-    self.start_cluster_with_args(minidump_path='', max_minidumps=2)
+    self.start_cluster_with_args(minidump_path='', max_minidumps=self.MAX_MINIDUMPS)
 
   def kill_cluster(self, signal):
     self.cluster.refresh()
@@ -139,6 +143,18 @@ class TestBreakpad(CustomClusterTestSuite):
   def count_all_minidumps(self, base_dir=None):
     return sum((self.count_minidumps(daemon, base_dir) for daemon in DAEMONS))
 
+  def assert_num_minidumps_for_all_daemons(self, base_dir=None):
+    self.assert_num_logfile_entries(1)
+    # IMPALA-3794 / Breakpad-681: Weak minidump ID generation can lead to name conflicts,
+    # so that one process overwrites the minidump of others. See IMPALA-3794 for more
+    # information.
+    # TODO: Change this here and elsewhere in this file to expect 'cluster_size' minidumps
+    # once Breakpad-681 has been fixed.
+    assert self.count_minidumps('impalad', base_dir) >= 1
+    assert self.count_minidumps('statestored', base_dir) == 1
+    assert self.count_minidumps('catalogd', base_dir) == 1
+
+
   def assert_num_logfile_entries(self, expected_count):
     self.assert_impalad_log_contains('INFO', 'Wrote minidump to ',
         expected_count=expected_count)
@@ -153,10 +169,7 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_all_minidumps() == 0
     cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad') == cluster_size
-    assert self.count_minidumps('statestored') == 1
-    assert self.count_minidumps('catalogd') == 1
+    self.assert_num_minidumps_for_all_daemons()
 
   @pytest.mark.execute_serially
   def test_sigusr1_writes_minidump(self):
@@ -175,10 +188,7 @@ class TestBreakpad(CustomClusterTestSuite):
     self.execute_query_expect_success(client, "SELECT COUNT(*) FROM functional.alltypes")
     # Kill the cluster. Sending SIGKILL will not trigger minidumps to be written.
     self.kill_cluster(SIGKILL)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad') == cluster_size
-    assert self.count_minidumps('statestored') == 1
-    assert self.count_minidumps('catalogd') == 1
+    self.assert_num_minidumps_for_all_daemons()
 
   @pytest.mark.execute_serially
   def test_minidump_relative_path(self):
@@ -193,10 +203,7 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_all_minidumps(minidump_base_dir) == 0
     cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad', minidump_base_dir) == cluster_size
-    assert self.count_minidumps('statestored', minidump_base_dir) == 1
-    assert self.count_minidumps('catalogd', minidump_base_dir) == 1
+    self.assert_num_minidumps_for_all_daemons(minidump_base_dir)
     shutil.rmtree(minidump_base_dir)
 
   @pytest.mark.execute_serially
@@ -207,7 +214,7 @@ class TestBreakpad(CustomClusterTestSuite):
     self.kill_cluster(SIGSEGV)
     self.assert_num_logfile_entries(1)
     self.start_cluster()
-    expected_impalads = min(self.get_num_processes('impalad'), 2)
+    expected_impalads = min(self.get_num_processes('impalad'), self.MAX_MINIDUMPS)
     assert self.count_minidumps('impalad') == expected_impalads
     assert self.count_minidumps('statestored') == 1
     assert self.count_minidumps('catalogd') == 1


[2/2] incubator-impala git commit: IMPALA-5183: increase write wait timeout in BufferedBlockMgrTest

Posted by ta...@apache.org.
IMPALA-5183: increase write wait timeout in BufferedBlockMgrTest

The timeout was set low enough that some kind of hardware/system
hiccup had a reasonable chance of causing occasional failures.
Bumped it to 10s - that should only fail if something is seriously
wrong.

Change-Id: Ibbe81eda916730445ca5cf667480ee0cf6b5d429
Reviewed-on: http://gerrit.cloudera.org:8080/6595
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8146532b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8146532b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8146532b

Branch: refs/heads/master
Commit: 8146532b640eef012623a4ef107743f1bfc41cdd
Parents: a827e9e
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Fri Apr 7 14:47:12 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Apr 8 23:09:37 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/buffered-block-mgr-test.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8146532b/be/src/runtime/buffered-block-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr-test.cc b/be/src/runtime/buffered-block-mgr-test.cc
index bd60a2c..65fd168 100644
--- a/be/src/runtime/buffered-block-mgr-test.cc
+++ b/be/src/runtime/buffered-block-mgr-test.cc
@@ -61,8 +61,9 @@ const string SCRATCH_DIR = "/tmp/impala-scratch";
 // This suffix is appended to a tmp dir
 const string SCRATCH_SUFFIX = "/impala-scratch";
 
-// Number of millieconds to wait to ensure write completes
-const static int WRITE_WAIT_MILLIS = 500;
+// Number of millieconds to wait to ensure write completes. We don't know for sure how
+// slow the disk will be, so this is much higher than we expect the writes to take.
+const static int WRITE_WAIT_MILLIS = 10000;
 
 // How often to check for write completion
 const static int WRITE_CHECK_INTERVAL_MILLIS = 10;