You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jy...@apache.org on 2016/09/15 23:58:53 UTC
[1/2] incubator-impala git commit: IMPALA-2013: Reintroduce steps for
checking HBase health in run-hbase.sh
Repository: incubator-impala
Updated Branches:
refs/heads/master c7fa03286 -> 9b3f43b9f
IMPALA-2013: Reintroduce steps for checking HBase health in run-hbase.sh
We used to include a step in run-hbase.sh for calling a python
script that queried Zookeeper to see if the HBase master was up.
The original script was problematic, so we stopped using it during
our mini-cluster HBase start up procedure.
HBase start up issues continue to plague us, however. This patch
reintroduces a Zookeeper check, with the following updates:
- replace the original script with check-hbase-nodes.py
- query the correct node /hbase/master, not just /hbase/rs
- use the python Zookeeper library kazoo, rather than calling
out to the shell and parsing the return string
- since we are moving toward testing on a remote cluster, also
add the capability to pass in the address for the host that
provides the Zookeeper and HBase services
- add an additional check that the HDFS service is running,
because of an edge case where the HBase master can briefly
start without a cluster running.
In addition to the expected tests, this script was also tested
under the conditions of IMPALA-4088, whereby the HBase RegionServer
is running, but the master fails because another listening process
has already taken its TCP port (60010) during startup.
Change-Id: I9b81f3cfb6ea0ba7b18ce5fcd5d268f515c8b0c3
Reviewed-on: http://gerrit.cloudera.org:8080/4348
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a42d18dc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a42d18dc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a42d18dc
Branch: refs/heads/master
Commit: a42d18dcc316ac2dbc32edcc2774272f0bdf469c
Parents: c7fa032
Author: David Knupp <dk...@cloudera.com>
Authored: Thu Sep 8 16:42:44 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Sep 15 00:02:22 2016 +0000
----------------------------------------------------------------------
infra/python/deps/requirements.txt | 1 +
testdata/bin/check-hbase-nodes.py | 174 +++++++++++++++++++++++++++++
testdata/bin/run-hbase.sh | 1 +
testdata/bin/wait-for-hbase-master.py | 59 ----------
4 files changed, 176 insertions(+), 59 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/infra/python/deps/requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt
index 1c87f8e..b433713 100644
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -54,6 +54,7 @@ impyla == 0.11.2
# before thirdparty is built thrift will be installed anyways.
thrift == 0.9.0
thrift_sasl == 0.1.0
+kazoo == 2.2.1
monkeypatch == 0.1rc3
ordereddict == 1.1
pexpect == 3.3
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/check-hbase-nodes.py
----------------------------------------------------------------------
diff --git a/testdata/bin/check-hbase-nodes.py b/testdata/bin/check-hbase-nodes.py
new file mode 100755
index 0000000..999a657
--- /dev/null
+++ b/testdata/bin/check-hbase-nodes.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env impala-python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Given a series of hosts and Zookeeper nodes, make sure that each node is accessible.
+"""
+
+import argparse
+import hdfs
+import logging
+import pprint
+import requests
+import sys
+import time
+
+from contextlib import closing
+from kazoo.client import KazooClient
+from kazoo.exceptions import NoNodeError
+from kazoo.handlers.threading import KazooTimeoutError
+
+LOGGER = logging.getLogger('hbase_check')
+LOGGER.addHandler(logging.StreamHandler())
+LOGGER.setLevel(logging.INFO)  # LOGGER.debug() output is suppressed at this level
+
+TIMEOUT_SECONDS = 30  # default for --timeout
+
+HDFS_HOST = '127.0.0.1:5070'  # default for --hdfs_host
+ZK_HOSTS = '127.0.0.1:2181'  # default for --zookeeper_hosts
+HBASE_NODES = ['/hbase/master', '/hbase/rs']  # default znodes to check
+ADMIN_USER = 'admin'  # default for --admin_user
+
+
+def parse_args():
+ """Parse the command line and return the resulting argparse namespace."""
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--timeout', '-t', type=int, default=TIMEOUT_SECONDS,
+ help=('Number of seconds to try to get znode before giving up. '
+ 'Default is {0} seconds.'.format(TIMEOUT_SECONDS)))
+
+ parser.add_argument('--hdfs_host', '-s', default=HDFS_HOST,
+ help=('Host:port where the HDFS web host is running, '
+ 'e.g, 0.0.0.0:5070. Default is {0}.'.format(HDFS_HOST)))
+
+ parser.add_argument('--admin_user', '-u', default=ADMIN_USER,
+ help='Cluster admin username. Default is {0}.'.format(ADMIN_USER))
+
+ parser.add_argument('--zookeeper_hosts', '-z', default=ZK_HOSTS,
+ help=('Comma-delineated string of hosts in host:PORT format, '
+ 'e.g, 0.0.0.0:2181. Default is {0}.'.format(ZK_HOSTS)))
+
+ parser.add_argument('-node', '-n', action='append', dest='nodes',  # NOTE(review): '-node', not '--node' - confirm
+ default=HBASE_NODES,  # NOTE(review): 'append' adds -n values after these defaults; it does not replace them
+ help=('HBase znode to check. Can be specified multiple times. '
+ 'Defaults are -n {0}.'.format(' -n '.join(HBASE_NODES))))
+ return parser.parse_args()
+
+
+def connect_to_zookeeper(host_list, timeout_seconds):
+ """Connect to Zookeeper service.
+
+ Args:
+ host_list: Comma-separated string of hosts in host:port format
+ timeout_seconds: Number of seconds to attempt to connect to host
+
+ Returns:
+ Started KazooClient instance; on KazooTimeoutError the process exits with status 1
+ """
+ zk_client = KazooClient(hosts=host_list)
+
+ try:
+ LOGGER.info("Connecting to Zookeeper host(s).")
+ zk_client.start(timeout=timeout_seconds)
+ LOGGER.info("Success: " + str(zk_client))
+ return zk_client
+ except KazooTimeoutError as e:
+ LOGGER.error("Could not connect to Zookeeper: " + str(e))
+ sys.exit(1)
+
+
+def check_znode(node, zk_client, timeout_seconds):
+ """Given a Zookeeper client and a node, poll until the node can be read.
+
+ Args:
+ node: name of a znode as a string, e.g., /hbase/rs
+ zk_client: Zookeeper client object
+ timeout_seconds: Number of seconds to attempt to get node
+
+ Returns:
+ 0 on success, 1 if the node could not be read before the timeout
+ """
+ start_time = time.time()
+ while (time.time() - start_time) < timeout_seconds:
+ LOGGER.info("Waiting for HBase node: " + node)
+ try:
+ node_info = zk_client.get(node)
+ LOGGER.info("Success: " + node)
+ LOGGER.debug(pprint.pformat(node_info))
+ return 0
+ except NoNodeError:
+ time.sleep(1)  # znode does not exist yet; pause before the next attempt
+
+ LOGGER.error("Failed while checking for HBase node: " + node)
+ return 1
+
+
+def check_znodes_list_for_errors(nodes, zookeeper_hosts, timeout):
+ """Confirm that the given list of znodes are responsive.
+
+ Args:
+ nodes: list of znode names as strings, e.g., ['/hbase/master', '/hbase/rs']
+ zookeeper_hosts: Comma-separated string of hosts in host:port format
+ timeout: Number of seconds to wait, both for the connection and for each node
+
+ Returns:
+ 0 on success, or else the number of unresponsive nodes
+ """
+ with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client:
+ errors = sum([check_znode(node, zk_client, timeout) for node in nodes])
+ zk_client.stop()  # closing() then calls zk_client.close() on exit
+ return errors
+
+
+def is_hdfs_running(host, admin_user):
+ """Confirm that HDFS is available.
+
+ There is a pathological case where the HBase master can start up briefly if HDFS is not
+ available, and then quit immediately, but that can be long enough to give a false positive
+ that the HBase master is running.
+
+ Args:
+ host: HDFS host:port
+ admin_user: Admin username
+
+ Returns:
+ True if the HDFS root could be listed, False on a connection or HDFS error
+ """
+ try:
+ hdfs_client = hdfs.InsecureClient('http://' + host, user=admin_user)
+ LOGGER.info("Contents of HDFS root: {0}".format(hdfs_client.list('/')))
+ return True
+ except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e:
+ msg = 'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e)
+ LOGGER.error(msg)
+ return False
+
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ if is_hdfs_running(args.hdfs_host, args.admin_user):
+ errors = check_znodes_list_for_errors(args.nodes, args.zookeeper_hosts, args.timeout)
+
+ if errors > 0:
+ msg = "Could not get one or more nodes. Exiting with errors: {0}".format(errors)
+ LOGGER.error(msg)
+ sys.exit(errors)  # exit status is the count of unresponsive znodes
+ else:
+ sys.exit(1)  # HDFS check failed, so the znode checks are skipped
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/run-hbase.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hbase.sh b/testdata/bin/run-hbase.sh
index 29ff77d..2a51105 100755
--- a/testdata/bin/run-hbase.sh
+++ b/testdata/bin/run-hbase.sh
@@ -126,4 +126,5 @@ for ((i=1; i <= HBASE_START_RETRY_ATTEMPTS; ++i)); do
fi
done
+${CLUSTER_BIN}/check-hbase-nodes.py
echo "HBase startup scripts succeeded"
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/wait-for-hbase-master.py
----------------------------------------------------------------------
diff --git a/testdata/bin/wait-for-hbase-master.py b/testdata/bin/wait-for-hbase-master.py
deleted file mode 100755
index 428949e..0000000
--- a/testdata/bin/wait-for-hbase-master.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env impala-python
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from subprocess import Popen, PIPE
-import os
-import re
-import sys
-import time
-
-now = time.time()
-TIMEOUT_SECONDS = 30.0
-ZK_CLASS="org.apache.zookeeper.ZooKeeperMain"
-
-print "Waiting for HBase Master"
-
-while time.time() - now < TIMEOUT_SECONDS:
- sys.stdout.write(".")
- sys.stdout.flush()
-
- p = Popen([os.environ["JAVA"],
- ZK_CLASS,
- "-server",
- "localhost:2181",
- "get",
- "/hbase/rs"], stderr=PIPE, stdout=PIPE)
- out, err = p.communicate()
- if re.match(".*" + ZK_CLASS + "\w*$", err):
- print "Failure"
- print err
- print "CLASSPATH does not contain " + ZK_CLASS
- print "Please check your CLASSPATH"
- exit(1)
-
- if "numChildren" in err:
- print "Success"
- print "HBase master is up, found in %2.1fs" % (time.time() - now,)
- exit(0)
-
- time.sleep(0.5)
-
-print "Failure"
-print "Hbase master did NOT write /hbase/rs in %2.1fs" % (time.time() - now,)
-exit(1)
[2/2] incubator-impala git commit: IMPALA-3912:
test_random_rpc_timeout is flaky.
Posted by jy...@apache.org.
IMPALA-3912: test_random_rpc_timeout is flaky.
The datastream sender's default timeout is 2 minutes, which can
prevent some fragments from completing until the timeout expires,
so the metric "num-fragments-in-flight" does not return to 0
within 60 seconds.
Decrease the sender timeout to 30 seconds and add
some logging.
Change-Id: I19f8b3fea66c5a0398e3476a46f060be9f951983
Reviewed-on: http://gerrit.cloudera.org:8080/4080
Reviewed-by: Juan Yu <jy...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/9b3f43b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/9b3f43b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/9b3f43b9
Branch: refs/heads/master
Commit: 9b3f43b9fd006fe5d72281b0862abe42e0a0a061
Parents: a42d18d
Author: Juan Yu <jy...@cloudera.com>
Authored: Mon Aug 22 11:49:18 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Sep 15 21:46:45 2016 +0000
----------------------------------------------------------------------
be/src/service/fragment-mgr.cc | 2 ++
tests/custom_cluster/test_rpc_timeout.py | 11 +++++++----
2 files changed, 9 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/be/src/service/fragment-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/service/fragment-mgr.cc b/be/src/service/fragment-mgr.cc
index 5ae5845..a98bbf9 100644
--- a/be/src/service/fragment-mgr.cc
+++ b/be/src/service/fragment-mgr.cc
@@ -100,6 +100,8 @@ void FragmentMgr::FragmentThread(TUniqueId fragment_instance_id) {
// the fragment exec state.
ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->Increment(-1L);
+ VLOG_QUERY << "PlanFragment completed. instance_id=" << fragment_instance_id;
+
#ifndef ADDRESS_SANITIZER
// tcmalloc and address sanitizer can not be used together
if (FLAGS_log_mem_usage_interval > 0) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/tests/custom_cluster/test_rpc_timeout.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_rpc_timeout.py b/tests/custom_cluster/test_rpc_timeout.py
index 76eb3d6..4ea5351 100644
--- a/tests/custom_cluster/test_rpc_timeout.py
+++ b/tests/custom_cluster/test_rpc_timeout.py
@@ -78,7 +78,8 @@ class TestRPCTimeout(CustomClusterTestSuite):
@pytest.mark.execute_serially
@CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
- " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1")
+ " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1"
+ " --datastream_sender_timeout_ms=30000")
def test_execplanfragment_timeout(self, vector):
for i in range(3):
ex= self.execute_query_expect_failure(self.client, self.TEST_QUERY)
@@ -91,7 +92,8 @@ class TestRPCTimeout(CustomClusterTestSuite):
@pytest.mark.execute_serially
@CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
- " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2")
+ " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2"
+ " --datastream_sender_timeout_ms=30000")
def test_cancelplanfragment_timeout(self, vector):
query = "select * from tpch.lineitem limit 5000"
self.execute_query_then_cancel(query, vector)
@@ -117,12 +119,13 @@ class TestRPCTimeout(CustomClusterTestSuite):
@pytest.mark.execute_serially
@CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
" --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=6"
- " --status_report_interval=1 ")
+ " --status_report_interval=1")
def test_reportexecstatus_timeout(self, vector):
self.execute_query_verify_metrics(self.TEST_QUERY)
@pytest.mark.execute_serially
@CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000"
- " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7")
+ " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7"
+ " --datastream_sender_timeout_ms=30000")
def test_random_rpc_timeout(self, vector):
self.execute_query_verify_metrics(self.TEST_QUERY, 10)