You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jy...@apache.org on 2016/09/15 23:58:53 UTC

[1/2] incubator-impala git commit: IMPALA-2013: Reintroduce steps for checking HBase health in run-hbase.sh

Repository: incubator-impala
Updated Branches:
  refs/heads/master c7fa03286 -> 9b3f43b9f


IMPALA-2013: Reintroduce steps for checking HBase health in run-hbase.sh

We used to include a step in run-hbase.sh for calling a Python
script that queried Zookeeper to see if the HBase master was up.
The original script was problematic, so we stopped using it during
our mini-cluster HBase startup procedure.

HBase startup issues continue to plague us, however. This patch
reintroduces a Zookeeper check, with the following updates:

- replace the original script with check-hbase-nodes.py
- query the correct node /hbase/master, not just /hbase/rs
- use the Python Zookeeper library kazoo, rather than calling
  out to the shell and parsing the returned string
- since we are moving toward testing on a remote cluster, also
  add the capability to pass in the address for the host that
  provides the Zookeeper and HBase services
- add an additional check that the HDFS service is running,
  because of an edge case where the HBase master can briefly
  start without a cluster running.

In addition to the expected tests, this script was also tested
under the conditions of IMPALA-4088, whereby the HBase RegionServer
is running, but the master fails because another listening process
has already taken its TCP port (60010) during startup.

Change-Id: I9b81f3cfb6ea0ba7b18ce5fcd5d268f515c8b0c3
Reviewed-on: http://gerrit.cloudera.org:8080/4348
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a42d18dc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a42d18dc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a42d18dc

Branch: refs/heads/master
Commit: a42d18dcc316ac2dbc32edcc2774272f0bdf469c
Parents: c7fa032
Author: David Knupp <dk...@cloudera.com>
Authored: Thu Sep 8 16:42:44 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Sep 15 00:02:22 2016 +0000

----------------------------------------------------------------------
 infra/python/deps/requirements.txt    |   1 +
 testdata/bin/check-hbase-nodes.py     | 174 +++++++++++++++++++++++++++++
 testdata/bin/run-hbase.sh             |   1 +
 testdata/bin/wait-for-hbase-master.py |  59 ----------
 4 files changed, 176 insertions(+), 59 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/infra/python/deps/requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt
index 1c87f8e..b433713 100644
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -54,6 +54,7 @@ impyla == 0.11.2
   # before thirdparty is built thrift will be installed anyways.
   thrift == 0.9.0
   thrift_sasl == 0.1.0
+kazoo == 2.2.1
 monkeypatch == 0.1rc3
 ordereddict == 1.1
 pexpect == 3.3

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/check-hbase-nodes.py
----------------------------------------------------------------------
diff --git a/testdata/bin/check-hbase-nodes.py b/testdata/bin/check-hbase-nodes.py
new file mode 100755
index 0000000..999a657
--- /dev/null
+++ b/testdata/bin/check-hbase-nodes.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env impala-python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Given a series of hosts and Zookeeper nodes, make sure that each node is accessible.
+"""
+
+import argparse
+import hdfs
+import logging
+import pprint
+import requests
+import sys
+import time
+
+from contextlib import closing
+from kazoo.client import KazooClient
+from kazoo.exceptions import NoNodeError
+from kazoo.handlers.threading import KazooTimeoutError
+
+LOGGER = logging.getLogger('hbase_check')
+LOGGER.addHandler(logging.StreamHandler())
+LOGGER.setLevel(logging.INFO)
+
+TIMEOUT_SECONDS = 30
+
+HDFS_HOST = '127.0.0.1:5070'
+ZK_HOSTS = '127.0.0.1:2181'
+HBASE_NODES = ['/hbase/master', '/hbase/rs']
+ADMIN_USER = 'admin'
+
+
+def parse_args():
+    """Parse and return command line args (-n values append to the default nodes)."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--timeout', '-t', type=int, default=TIMEOUT_SECONDS,
+                        help=('Number of seconds to try to get znode before giving up. '
+                              'Default is {0} seconds.'.format(TIMEOUT_SECONDS)))
+
+    parser.add_argument('--hdfs_host', '-s', default=HDFS_HOST,
+                        help=('Host:port where the HDFS web host is running, '
+                              'e.g, 0.0.0.0:5070. Default is {0}.'.format(HDFS_HOST)))
+
+    parser.add_argument('--admin_user', '-u', default=ADMIN_USER,
+                        help='Cluster admin username. Default is {0}.'.format(ADMIN_USER))
+
+    parser.add_argument('--zookeeper_hosts', '-z', default=ZK_HOSTS,
+                        help=('Comma-delineated string of hosts in host:PORT format, '
+                              'e.g, 0.0.0.0:2181. Default is {0}.'.format(ZK_HOSTS)))
+
+    parser.add_argument('-node', '-n', action='append', dest='nodes',
+                        default=HBASE_NODES,
+                        help=('HBase znode to check. Can be specified multiple times. '
+                              'Defaults are -n {0}.'.format(' -n '.join(HBASE_NODES))))
+    return parser.parse_args()
+
+
+def connect_to_zookeeper(host_list, timeout_seconds):
+    """Connect to Zookeeper service.
+
+    Args:
+        host_list: Comma-separated string of hosts in host:port format
+        timeout_seconds: Number of seconds to attempt to connect to host
+
+    Returns:
+        Started KazooClient instance (the process exits with status 1 on timeout)
+    """
+    zk_client = KazooClient(hosts=host_list)
+
+    try:
+        LOGGER.info("Connecting to Zookeeper host(s).")
+        zk_client.start(timeout=timeout_seconds)
+        LOGGER.info("Success: " + str(zk_client))
+        return zk_client
+    except KazooTimeoutError as e:
+        LOGGER.error("Could not connect to Zookeeper: " + str(e))
+        sys.exit(1)
+
+
+def check_znode(node, zk_client, timeout_seconds):
+    """Given a Zookeeper client and a node, check that the node is up.
+
+    Args:
+        node: name of a znode as a string, e.g., /hbase/rs
+        zk_client: Zookeeper client object
+        timeout_seconds: Number of seconds to attempt to get node
+
+    Returns:
+        0 on success, 1 if the node is still missing when the timeout expires
+    """
+    start_time = time.time()
+    while (time.time() - start_time) < timeout_seconds:
+        LOGGER.info("Waiting for HBase node: " + node)
+        try:
+            node_info = zk_client.get(node)
+            LOGGER.info("Success: " + node)
+            LOGGER.debug(pprint.pformat(node_info))
+            return 0
+        except NoNodeError:
+            time.sleep(1)
+
+    LOGGER.error("Failed while checking for HBase node: " + node)
+    return 1
+
+
+def check_znodes_list_for_errors(nodes, zookeeper_hosts, timeout):
+    """Confirm that the given list of znodes are responsive.
+
+    Args:
+        nodes: list of znode names as strings, e.g., ['/hbase/master', '/hbase/rs']
+        zookeeper_hosts: Comma-separated string of hosts in host:port format
+        timeout: Number of seconds for the connection attempt and for each node check
+
+    Returns:
+        0 on success, or else the number of unresponsive nodes
+    """
+    with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client:
+        errors = sum([check_znode(node, zk_client, timeout) for node in nodes])
+        zk_client.stop()
+    return errors
+
+
+def is_hdfs_running(host, admin_user):
+    """Confirm that HDFS is available.
+
+    There is a pathological case where the HBase master can start up briefly if HDFS is not
+    available, and then quit immediately, but that can be long enough to give a false positive
+    that the HBase master is running.
+
+    Args:
+        host: HDFS host:port
+        admin_user: Admin username
+
+    Returns:
+        True if the HDFS root could be listed; False on a connection or HDFS error
+    """
+    try:
+        hdfs_client = hdfs.InsecureClient('http://' + host, user=admin_user)
+        LOGGER.info("Contents of HDFS root: {0}".format(hdfs_client.list('/')))
+        return True
+    except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e:
+        msg = 'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e)
+        LOGGER.error(msg)
+        return False
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    if is_hdfs_running(args.hdfs_host, args.admin_user):
+        errors = check_znodes_list_for_errors(args.nodes, args.zookeeper_hosts, args.timeout)
+
+        if errors > 0:
+            msg = "Could not get one or more nodes. Exiting with errors: {0}".format(errors)
+            LOGGER.error(msg)
+            sys.exit(errors)
+    else:
+        sys.exit(1)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/run-hbase.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hbase.sh b/testdata/bin/run-hbase.sh
index 29ff77d..2a51105 100755
--- a/testdata/bin/run-hbase.sh
+++ b/testdata/bin/run-hbase.sh
@@ -126,4 +126,5 @@ for ((i=1; i <= HBASE_START_RETRY_ATTEMPTS; ++i)); do
   fi
 
 done
+${CLUSTER_BIN}/check-hbase-nodes.py
 echo "HBase startup scripts succeeded"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/wait-for-hbase-master.py
----------------------------------------------------------------------
diff --git a/testdata/bin/wait-for-hbase-master.py b/testdata/bin/wait-for-hbase-master.py
deleted file mode 100755
index 428949e..0000000
--- a/testdata/bin/wait-for-hbase-master.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env impala-python
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from subprocess import Popen, PIPE
-import os
-import re
-import sys
-import time
-
-now = time.time()
-TIMEOUT_SECONDS = 30.0
-ZK_CLASS="org.apache.zookeeper.ZooKeeperMain"
-
-print "Waiting for HBase Master"
-
-while time.time() - now < TIMEOUT_SECONDS:
-  sys.stdout.write(".")
-  sys.stdout.flush()
-
-  p = Popen([os.environ["JAVA"],
-             ZK_CLASS,
-             "-server",
-             "localhost:2181",
-             "get",
-             "/hbase/rs"], stderr=PIPE, stdout=PIPE)
-  out, err = p.communicate()
-  if re.match(".*" + ZK_CLASS + "\w*$", err):
-    print "Failure"
-    print err
-    print "CLASSPATH does not contain " + ZK_CLASS
-    print "Please check your CLASSPATH"
-    exit(1)
-
-  if "numChildren" in err:
-    print "Success"
-    print "HBase master is up, found in %2.1fs" % (time.time() - now,)
-    exit(0)
-
-  time.sleep(0.5)
-
-print "Failure"
-print "Hbase master did NOT write /hbase/rs in %2.1fs" % (time.time() - now,)
-exit(1)


[2/2] incubator-impala git commit: IMPALA-3912: test_random_rpc_timeout is flaky.

Posted by jy...@apache.org.
IMPALA-3912: test_random_rpc_timeout is flaky.

The datastream sender's default timeout is 2 minutes, which can
block some fragments from completing until the timeout expires and
keep the "num-fragments-in-flight" metric from returning to 0
within 60 seconds.
Decrease the sender timeout to 30 seconds and add
some logging.

Change-Id: I19f8b3fea66c5a0398e3476a46f060be9f951983
Reviewed-on: http://gerrit.cloudera.org:8080/4080
Reviewed-by: Juan Yu <jy...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/9b3f43b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/9b3f43b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/9b3f43b9

Branch: refs/heads/master
Commit: 9b3f43b9fd006fe5d72281b0862abe42e0a0a061
Parents: a42d18d
Author: Juan Yu <jy...@cloudera.com>
Authored: Mon Aug 22 11:49:18 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Sep 15 21:46:45 2016 +0000

----------------------------------------------------------------------
 be/src/service/fragment-mgr.cc           |  2 ++
 tests/custom_cluster/test_rpc_timeout.py | 11 +++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/be/src/service/fragment-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/service/fragment-mgr.cc b/be/src/service/fragment-mgr.cc
index 5ae5845..a98bbf9 100644
--- a/be/src/service/fragment-mgr.cc
+++ b/be/src/service/fragment-mgr.cc
@@ -100,6 +100,8 @@ void FragmentMgr::FragmentThread(TUniqueId fragment_instance_id) {
   // the fragment exec state.
   ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->Increment(-1L);
 
+  VLOG_QUERY << "PlanFragment completed. instance_id=" << fragment_instance_id;
+
 #ifndef ADDRESS_SANITIZER
   // tcmalloc and address sanitizer can not be used together
   if (FLAGS_log_mem_usage_interval > 0) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/tests/custom_cluster/test_rpc_timeout.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_rpc_timeout.py b/tests/custom_cluster/test_rpc_timeout.py
index 76eb3d6..4ea5351 100644
--- a/tests/custom_cluster/test_rpc_timeout.py
+++ b/tests/custom_cluster/test_rpc_timeout.py
@@ -78,7 +78,8 @@ class TestRPCTimeout(CustomClusterTestSuite):
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
-      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1")
+      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1"
+      " --datastream_sender_timeout_ms=30000")
   def test_execplanfragment_timeout(self, vector):
     for i in range(3):
       ex= self.execute_query_expect_failure(self.client, self.TEST_QUERY)
@@ -91,7 +92,8 @@ class TestRPCTimeout(CustomClusterTestSuite):
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
-      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2")
+      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2"
+      " --datastream_sender_timeout_ms=30000")
   def test_cancelplanfragment_timeout(self, vector):
     query = "select * from tpch.lineitem limit 5000"
     self.execute_query_then_cancel(query, vector)
@@ -117,12 +119,13 @@ class TestRPCTimeout(CustomClusterTestSuite):
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
       " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=6"
-      " --status_report_interval=1 ")
+      " --status_report_interval=1")
   def test_reportexecstatus_timeout(self, vector):
     self.execute_query_verify_metrics(self.TEST_QUERY)
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
-      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7")
+      " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7"
+      " --datastream_sender_timeout_ms=30000")
   def test_random_rpc_timeout(self, vector):
     self.execute_query_verify_metrics(self.TEST_QUERY, 10)