You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by nc...@apache.org on 2015/10/19 15:54:19 UTC

[33/50] [abbrv] ambari git commit: AMBARI-13427: NAMENODE START failed with both NN's being passive (jluniya)

AMBARI-13427: NAMENODE START failed with both NN's being passive (jluniya)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/3318eb68
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/3318eb68
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/3318eb68

Branch: refs/heads/branch-dev-patch-upgrade
Commit: 3318eb682611fffbe817815cdffdf6aca2aacfa2
Parents: dec7c8e
Author: Jayush Luniya <jl...@hortonworks.com>
Authored: Fri Oct 16 11:30:38 2015 -0700
Committer: Jayush Luniya <jl...@hortonworks.com>
Committed: Fri Oct 16 11:30:38 2015 -0700

----------------------------------------------------------------------
 .../libraries/functions/decorator.py            |  5 ++--
 .../libraries/functions/namenode_ha_utils.py    | 27 ++++++++++++++++++--
 .../python/stacks/2.0.6/HDFS/test_namenode.py   | 17 +++++++++++-
 3 files changed, 44 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/3318eb68/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py
----------------------------------------------------------------------
diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py b/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py
index cd653e5..1b45981 100644
--- a/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py
+++ b/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py
@@ -26,7 +26,7 @@ __all__ = ['retry', ]
 from resource_management.core.logger import Logger
 
 
-def retry(times=3, sleep_time=1, backoff_factor=1, err_class=Exception):
+def retry(times=3, sleep_time=1, max_sleep_time=8, backoff_factor=1, err_class=Exception):
   """
   Retry decorator for improved robustness of functions.
   :param times: Number of times to attempt to call the function.
@@ -44,12 +44,13 @@ def retry(times=3, sleep_time=1, backoff_factor=1, err_class=Exception):
 
       while _times > 1:
         _times -= 1
-        _sleep_time *= _backoff_factor
         try:
           return function(*args, **kwargs)
         except _err_class, err:
           Logger.info("Will retry %d time(s), caught exception: %s. Sleeping for %d sec(s)" % (_times, str(err), _sleep_time))
           time.sleep(_sleep_time)
+        if(_sleep_time * _backoff_factor <= max_sleep_time):
+          _sleep_time *= _backoff_factor
 
       return function(*args, **kwargs)
     return wrapper

http://git-wip-us.apache.org/repos/asf/ambari/blob/3318eb68/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
----------------------------------------------------------------------
diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
index 99f90b8..0920e85 100644
--- a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
+++ b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
@@ -23,6 +23,8 @@ from resource_management.libraries.functions.format import format
 from resource_management.libraries.functions.jmx import get_value_from_jmx
 from resource_management.core.base import Fail
 from resource_management.core import shell
+from resource_management.core.logger import Logger
+from resource_management.libraries.functions.decorator import retry
 
 __all__ = ["get_namenode_states", "get_active_namenode", "get_property_for_active_namenode"]
 
@@ -32,8 +34,29 @@ HDFS_NN_STATE_STANDBY = 'standby'
 NAMENODE_HTTP_FRAGMENT = 'dfs.namenode.http-address.{0}.{1}'
 NAMENODE_HTTPS_FRAGMENT = 'dfs.namenode.https-address.{0}.{1}'
 JMX_URI_FRAGMENT = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem"
-  
-def get_namenode_states(hdfs_site, security_enabled, run_user):
+
+def get_namenode_states(hdfs_site, security_enabled, run_user, times=10, sleep_time=1, backoff_factor=2):
+  """
+  return format [('nn1', 'hdfs://hostname1:port1'), ('nn2', 'hdfs://hostname2:port2')] , [....], [....]
+  """
+  @retry(times=times, sleep_time=sleep_time, backoff_factor=backoff_factor, err_class=Fail)
+  def doRetries(hdfs_site, security_enabled, run_user):
+    doRetries.attempt += 1
+    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states_noretries(hdfs_site, security_enabled, run_user)
+    Logger.info(
+      "NameNode HA states: active_namenodes = {0}, standby_namenodes = {1}, unknown_namenodes = {2}".format(
+        active_namenodes, standby_namenodes, unknown_namenodes))
+    if active_namenodes:
+      return active_namenodes, standby_namenodes, unknown_namenodes
+    elif doRetries.attempt == times:
+      Logger.warning("No active NameNode was found after {0} retries. Will return current NameNode HA states".format(times))
+      return active_namenodes, standby_namenodes, unknown_namenodes
+    raise Fail('No active NameNode was found.')
+
+  doRetries.attempt = 0
+  return doRetries(hdfs_site, security_enabled, run_user)
+
+def get_namenode_states_noretries(hdfs_site, security_enabled, run_user):
   """
   return format [('nn1', 'hdfs://hostname1:port1'), ('nn2', 'hdfs://hostname2:port2')] , [....], [....]
   """

http://git-wip-us.apache.org/repos/asf/ambari/blob/3318eb68/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
index 3378892..e954a84 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
@@ -1275,10 +1275,16 @@ class TestNamenode(RMFTestCase):
     put_structured_out_mock.assert_called_with({"securityState": "UNSECURED"})
 
 
-  def test_upgrade_restart(self):
+  @patch("utils.get_namenode_states")
+  def test_upgrade_restart(self, get_namenode_states_mock):
     #   Execution of nn_ru_lzo invokes a code path that invokes lzo installation, which
     #   was failing in RU case.  See hdfs.py and the lzo_enabled check that is in it.
     #   Just executing the script is enough to test the fix
+    active_namenodes = [('nn1', 'c6401.ambari.apache.org:50070')]
+    standby_namenodes = [('nn2', 'c6402.ambari.apache.org:50070')]
+    unknown_namenodes = []
+
+    get_namenode_states_mock.return_value = active_namenodes, standby_namenodes, unknown_namenodes
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
                        classname = "NameNode",
                        command = "restart",
@@ -1286,6 +1292,15 @@ class TestNamenode(RMFTestCase):
                        hdp_stack_version = self.STACK_VERSION,
                        target = RMFTestCase.TARGET_COMMON_SERVICES)
 
+    unknown_namenodes = active_namenodes
+    active_namenodes = []
+    get_namenode_states_mock.return_value = active_namenodes, standby_namenodes, unknown_namenodes
+    self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
+                     classname = "NameNode",
+                     command = "restart",
+                     config_file = "nn_ru_lzo.json",
+                     hdp_stack_version = self.STACK_VERSION,
+                     target = RMFTestCase.TARGET_COMMON_SERVICES)
 
   def test_pre_rolling_restart(self):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"