Posted to commits@ambari.apache.org by jo...@apache.org on 2015/06/02 19:48:26 UTC

ambari git commit: AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)

Repository: ambari
Updated Branches:
  refs/heads/trunk b6c115ba2 -> b40d808d3


AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/b40d808d
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/b40d808d
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/b40d808d

Branch: refs/heads/trunk
Commit: b40d808d3a4ee9c7855532e555d89dd5910301f3
Parents: b6c115b
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Tue Jun 2 10:29:53 2015 -0400
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Tue Jun 2 13:48:03 2015 -0400

----------------------------------------------------------------------
 .../package/scripts/datanode_upgrade.py         | 13 ++++++++--
 .../python/stacks/2.0.6/HDFS/test_datanode.py   | 27 ++++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
index 529ca4438..29af5bd 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
@@ -62,18 +62,27 @@ def post_upgrade_check():
   _check_datanode_startup()
 
 
-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown():
   """
   Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
   several times, pausing between runs. Once the DataNode stops responding,
   this method returns; otherwise it raises a Fail(...) and is retried
   automatically.
+  The HDFS stack defaults for client retries are also far too slow for this
+  command; they wait roughly 45 seconds between attempts. As a result, a
+  single dfsadmin execution takes about 45 seconds before retrying, by which
+  point the DataNode may already be marked as dead, causing problems for
+  HBase. https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing
+  the default for ipc.client.connect.retry.interval. In the meantime,
+  override the retry settings here, but only for rolling upgrades (RU).
   :return:
   """
   import params
 
-  command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+  # override the stock retry timeouts; after 30 seconds without a response
+  # the DataNode is marked as dead, which can affect HBase during RU
+  command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')
 
   try:
     Execute(command, user=params.hdfs_user, tries=1)
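
For context, the @retry decorator used above re-invokes the wrapped function
up to times attempts, sleeping sleep_time seconds after each failure, so this
change keeps the same worst-case budget (12 x 10s = 24 x 5s = 120 seconds)
while polling the DataNode twice as often. Below is a minimal sketch of a
decorator with those semantics, written against the call sites above; the
real implementation lives in Ambari's resource_management library, so treat
the body here as illustrative rather than the actual source:

  import time
  from functools import wraps

  def retry(times, sleep_time, err_class):
    """Illustrative stand-in for Ambari's retry decorator (assumed semantics)."""
    def decorator(function):
      @wraps(function)
      def wrapper(*args, **kwargs):
        # retry the wrapped function, sleeping after each failed attempt
        for _ in range(times - 1):
          try:
            return function(*args, **kwargs)
          except err_class:
            time.sleep(sleep_time)
        # on the final attempt, let err_class propagate to the caller
        return function(*args, **kwargs)
      return wrapper
    return decorator

With times=24 and sleep_time=5, a DataNode that never stops responding still
fails the check after roughly two minutes, while one that shuts down quickly
is detected within about five seconds of the stop command.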

http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
index a310bf4..2440145 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
@@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase):
       self.assertTrue(mocks_dict['call'].called)
       self.assertEqual(mocks_dict['call'].call_count, 12)
 
+
+  @patch('time.sleep')
+  def test_stop_during_upgrade(self, time_mock):
+    config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    with open(config_file, "r") as f:
+      json_content = json.load(f)
+
+    version = '2.2.1.0-3242'
+    json_content['commandParams']['version'] = version
+
+    try:
+      self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py",
+        classname = "DataNode",
+        command = "stop",
+        config_dict = json_content,
+        hdp_stack_version = self.STACK_VERSION,
+        target = RMFTestCase.TARGET_COMMON_SERVICES,
+        command_args=[True])
+
+      self.fail("Expected a Fail since the DataNode never reported a shutdown")
+    except Fail:
+      pass
+
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+
+
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")