You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ol...@apache.org on 2016/04/25 12:04:53 UTC

ambari git commit: AMBARI-15991. DataNode and RegionServer during upgrade are reported as "failed" incorrectly (part2) (Daniel Gergely via oleewere)

Repository: ambari
Updated Branches:
  refs/heads/trunk 8a20810b2 -> 0806468be


AMBARI-15991. DataNode and RegionServer during upgrade are reported as "failed" incorrectly (part2) (Daniel Gergely via oleewere)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/0806468b
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/0806468b
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/0806468b

Branch: refs/heads/trunk
Commit: 0806468bedeb764f2d97025cb086944e482ace84
Parents: 8a20810
Author: Daniel Gergely <dg...@hortonworks.com>
Authored: Mon Apr 25 12:01:46 2016 +0200
Committer: oleewere <ol...@gmail.com>
Committed: Mon Apr 25 12:01:46 2016 +0200

----------------------------------------------------------------------
 .../HBASE/0.96.0.2.0/package/scripts/upgrade.py     | 14 ++++++--------
 .../2.1.0.2.0/package/scripts/datanode_upgrade.py   | 16 +++++++---------
 .../test/python/stacks/2.0.6/HDFS/test_datanode.py  | 10 +++++-----
 3 files changed, 18 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/0806468b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
index bc68cc6..b1a19e6 100644
--- a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
+++ b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
@@ -41,14 +41,8 @@ def post_regionserver(env):
   check_cmd = "echo 'status \"simple\"' | {0} shell".format(params.hbase_cmd)
 
   exec_cmd = "{0} {1}".format(params.kinit_cmd, check_cmd)
-  _wait_for_region_server_to_start(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE)
+  call_and_match(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE)
 
-@retry(times=3, sleep_time=300, err_class=Fail)
-def _wait_for_region_server_to_start(cmd, user, regex, regex_search_flags):
-  if not is_region_server_process_running():
-    Logger.info("RegionServer process is not running")
-    raise Fail("RegionServer process is not running")
-  call_and_match(cmd, user, regex, regex_search_flags)
 
 def is_region_server_process_running():
   try:
@@ -58,9 +52,13 @@ def is_region_server_process_running():
   except ComponentIsNotRunning:
     return False
 
-@retry(times=15, sleep_time=2, err_class=Fail)
+@retry(times=30, sleep_time=30, err_class=Fail) # keep trying for 15 mins
 def call_and_match(cmd, user, regex, regex_search_flags):
 
+  if not is_region_server_process_running():
+    Logger.info("RegionServer process is not running")
+    raise Fail("RegionServer process is not running")
+
   code, out = shell.call(cmd, user=user)
 
   if not (out and re.search(regex, out, regex_search_flags)):

http://git-wip-us.apache.org/repos/asf/ambari/blob/0806468b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
index c8e2eab..b55237d 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
@@ -73,13 +73,6 @@ def post_upgrade_check(hdfs_binary):
     Execute(params.dn_kinit_cmd, user=params.hdfs_user)
 
   # verify that the datanode has started and rejoined the HDFS cluster
-  _wait_for_datanode_to_join(hdfs_binary)
-
-@retry(times=3, sleep_time=300, err_class=Fail)
-def _wait_for_datanode_to_join(hdfs_binary):
-  if not is_datanode_process_running():
-    Logger.info("DataNode process is not running")
-    raise Fail("DataNode process is not running")
   _check_datanode_startup(hdfs_binary)
 
 
@@ -125,16 +118,21 @@ def _check_datanode_shutdown(hdfs_binary):
   raise Fail('DataNode has not shutdown.')
 
 
-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=30, sleep_time=30, err_class=Fail) # keep trying for 15 mins
 def _check_datanode_startup(hdfs_binary):
   """
-  Checks that a DataNode is reported as being alive via the
+  Checks that a DataNode process is running and DataNode is reported as being alive via the
   "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is found to be
   alive this method will return, otherwise it will raise a Fail(...) and retry
   automatically.
   :param hdfs_binary: name/path of the HDFS binary to use
   :return:
   """
+
+  if not is_datanode_process_running():
+    Logger.info("DataNode process is not running")
+    raise Fail("DataNode process is not running")
+
   import params
   import socket
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/0806468b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
index dbd76cf..90c12ca 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
@@ -515,7 +515,7 @@ class TestDatanode(RMFTestCase):
                        config_file = "default.json",
                        stack_version = self.STACK_VERSION,
                        target = RMFTestCase.TARGET_COMMON_SERVICES,
-                       call_mocks = [(0, shell_call_output)] * 3,
+                       call_mocks = [(0, shell_call_output)],
                        mocks_dict = mocks_dict
     )
 
@@ -535,13 +535,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(0, 'There are no DataNodes here!')] * 36,
+                         call_mocks = [(0, 'There are no DataNodes here!')] * 30,
                          mocks_dict = mocks_dict
       )
       self.fail('Missing DataNode should have caused a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,36)
+      self.assertEqual(mocks_dict['call'].call_count,30)
 
 
   @patch("socket.gethostbyname")
@@ -556,13 +556,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(1, 'some')] * 36,
+                         call_mocks = [(1, 'some')] * 30,
                          mocks_dict = mocks_dict
       )
       self.fail('Invalid return code should cause a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,36)
+      self.assertEqual(mocks_dict['call'].call_count,30)
 
 
   @patch("resource_management.core.shell.call")