You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2015/06/02 19:48:26 UTC
ambari git commit: AMBARI-11624 - Datanode Shutdown Retries During
Upgrade Are Too Long (jonathanhurley)
Repository: ambari
Updated Branches:
refs/heads/trunk b6c115ba2 -> b40d808d3
AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/b40d808d
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/b40d808d
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/b40d808d
Branch: refs/heads/trunk
Commit: b40d808d3a4ee9c7855532e555d89dd5910301f3
Parents: b6c115b
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Tue Jun 2 10:29:53 2015 -0400
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Tue Jun 2 13:48:03 2015 -0400
----------------------------------------------------------------------
.../package/scripts/datanode_upgrade.py | 13 ++++++++--
.../python/stacks/2.0.6/HDFS/test_datanode.py | 27 ++++++++++++++++++++
2 files changed, 38 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
index 529ca4438..29af5bd 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
@@ -62,18 +62,27 @@ def post_upgrade_check():
_check_datanode_startup()
-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=24, sleep_time=5, err_class=Fail)
def _check_datanode_shutdown():
"""
Checks that a DataNode is down by running "hdfs dfsadmin getDatanodeInfo"
several times, pausing in between runs. Once the DataNode stops responding
this method will return, otherwise it will raise a Fail(...) and retry
automatically.
+ The stack defaults for retrying for HDFS are also way too slow for this
+ command; they are set to wait about 45 seconds between client retries. As
+ a result, a single execution of dfsadmin will take 45 seconds to retry and
+ the DataNode may be marked as dead, causing problems with HBase.
+ https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the
+ times for ipc.client.connect.retry.interval. In the meantime, override them
+ here, but only for RU.
:return:
"""
import params
- command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+ # override stock retry timeouts since after 30 seconds, the datanode is
+ # marked as dead and can affect HBase during RU
+ command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')
try:
Execute(command, user=params.hdfs_user, tries=1)
http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
index a310bf4..2440145 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
@@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase):
self.assertTrue(mocks_dict['call'].called)
self.assertEqual(mocks_dict['call'].call_count,12)
+
+ @patch('time.sleep')
+ def test_stop_during_upgrade(self, time_mock):
+ config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+ with open(config_file, "r") as f:
+ json_content = json.load(f)
+
+ version = '2.2.1.0-3242'
+ json_content['commandParams']['version'] = version
+
+ try:
+ self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py",
+ classname = "DataNode",
+ command = "stop",
+ config_dict = json_content,
+ hdp_stack_version = self.STACK_VERSION,
+ target = RMFTestCase.TARGET_COMMON_SERVICES,
+ command_args=[True])
+
+ raise Fail("Expected a fail since datanode didn't report a shutdown")
+ except:
+ pass
+
+ self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
+ self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+
+
@patch("resource_management.libraries.functions.security_commons.build_expectations")
@patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
@patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")