Posted to commits@ambari.apache.org by jo...@apache.org on 2016/06/15 14:20:25 UTC

ambari git commit: AMBARI-17236 - Namenode start step failed during EU with RetriableException (jonathanhurley)

Repository: ambari
Updated Branches:
  refs/heads/trunk b18ce3aef -> fd83a142e


AMBARI-17236 - Namenode start step failed during EU with RetriableException (jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/fd83a142
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/fd83a142
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/fd83a142

Branch: refs/heads/trunk
Commit: fd83a142e374f155f148db0947b245edfaeb0568
Parents: b18ce3a
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Tue Jun 14 17:10:42 2016 -0400
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Wed Jun 15 10:20:07 2016 -0400

----------------------------------------------------------------------
 .../libraries/resources/hdfs_resource.py        |  6 +-
 .../2.1.0.2.0/package/scripts/hdfs_namenode.py  | 87 +++++++++-----------
 .../python/stacks/2.0.6/HDFS/test_namenode.py   | 37 +++++++++
 3 files changed, 78 insertions(+), 52 deletions(-)
----------------------------------------------------------------------
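In short: during an Express (nonrolling) Upgrade the NameNode cannot leave
Safemode until the DataNodes come back up, so waiting for Safemode OFF and
then creating HDFS directories right after starting the NameNode is what
surfaced as a RetriableException. The patch below defaults to waiting and
creating, and opts out for the Standby NameNode in HA and for Express
Upgrades. A minimal sketch of the corrected gate (hypothetical helper names
standing in for the real functions in hdfs_namenode.py):

  # Sketch only; wait_for_safemode_off() and create_hdfs_directories()
  # stand in for the real Ambari functions.
  def post_start_actions(dfs_ha_enabled, is_standby, upgrade_type):
    ensure_safemode_off = True   # wait for Safemode OFF by default
    is_active_namenode = True    # non-HA, or the Active NameNode in HA

    if dfs_ha_enabled and is_standby:
      # the Standby NameNode keeps its Safemode state; nothing to wait for
      ensure_safemode_off = False
      is_active_namenode = False

    if upgrade_type == "nonrolling":
      # Express Upgrade: DataNodes are down, so Safemode cannot end yet
      ensure_safemode_off = False

    if ensure_safemode_off:
      wait_for_safemode_off()

    # creating directories while the NameNode is still in Safemode raises
    # RetriableException, so both flags must hold
    if is_active_namenode and ensure_safemode_off:
      create_hdfs_directories()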


http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
----------------------------------------------------------------------
diff --git a/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py b/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
index 18e61fb..5761fd6 100644
--- a/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
+++ b/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
@@ -30,7 +30,7 @@ The cause is that for every call new connection initialized, with datanodes, nam
 This resource instead gathers the directories/files to create/delete/copyFromLocal,
 and then creates all of them with a single call.
 
-action = create_delayed / delete_delayed. Are for gathering information  about what you want
+action = create_on_execute / delete_on_execute are for gathering information about what you want
 to create.
 
 After everything is gathered you should execute action = execute. To perform delayed actions
@@ -52,7 +52,7 @@ class HdfsResource(Resource):
   target = ResourceArgument(default=lambda obj: obj.name)
   # "directory" or "file"
   type = ResourceArgument()
-  # "create_delayed" or "delete_delayed" or "execute"
+  # "create_on_execute" or "delete_on_execute" or "execute"
   action = ForcedListArgument()
   # if present - copies file/directory from local path {source} to hadoop path - {target}
   source = ResourceArgument()
@@ -103,5 +103,5 @@ class HdfsResource(Resource):
   dfs_type = ResourceArgument(default="")
 
   #action 'execute' immediately creates all pending files/directories in efficient manner
-  #action 'create_delayed/delete_delayed' adds file/directory to list of pending directories
+  #action 'create_on_execute/delete_on_execute' adds file/directory to list of pending directories
   actions = Resource.actions + ["create_on_execute", "delete_on_execute", "execute"]

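For context, the docstring above describes a gather-then-flush pattern: queue
filesystem work with create_on_execute / delete_on_execute, then apply all of
it in one pass with execute. An illustrative usage sketch (paths and owner are
made up; see hdfs_resource.py for the full argument list):

  # queue work without touching HDFS yet
  HdfsResource("/tmp/app-dir",
               type="directory",
               action="create_on_execute",
               owner="hdfs")
  HdfsResource("/tmp/stale-file",
               type="file",
               action="delete_on_execute")

  # one call performs every pending create/delete efficiently
  HdfsResource(None, action="execute")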
http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
index 635f159..5a431aa 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -51,7 +51,11 @@ def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
   """
   import params
 
-  Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
+  retries = 115
+  sleep_seconds = 10
+  sleep_minutes = int(sleep_seconds * retries / 60)
+
+  Logger.info("Waiting up to {0} minutes for the NameNode to leave Safemode...".format(sleep_minutes))
 
   if params.security_enabled and execute_kinit:
     kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
@@ -64,18 +68,14 @@ def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
     is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
 
     # Wait up to ~19 minutes (retries * sleep_seconds)
-    Execute(is_namenode_safe_mode_off,
-            tries=115,
-            try_sleep=10,
-            user=params.hdfs_user,
-            logoutput=True
-            )
+    Execute(is_namenode_safe_mode_off, tries=retries, try_sleep=sleep_seconds,
+      user=params.hdfs_user, logoutput=True)
 
     # Wait a bit more since YARN still depends on block reports coming in.
     # Also saw intermittent errors with HBASE service check if it was done too soon.
     time.sleep(afterwait_sleep)
   except Fail:
-    Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+    Logger.error("The NameNode is still in Safemode. Please be careful with commands that need Safemode OFF.")
 
 @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
 def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
@@ -159,64 +159,53 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
       Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
               user = params.hdfs_user)
 
-    if params.dfs_ha_enabled:
-      is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
-    else:
-      is_active_namenode_cmd = True
-    
-    # During NonRolling Upgrade, both NameNodes are initially down,
-    # so no point in checking if this is the active or standby.
-    if upgrade_type == "nonrolling":
-      is_active_namenode_cmd = False
-
     # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
     # no-HA                 | ON -> OFF                | Yes                      |
     # HA and active         | ON -> OFF                | Yes                      |
     # HA and standby        | no change                | no check                 |
     # RU with HA on active  | ON -> OFF                | Yes                      |
     # RU with HA on standby | ON -> OFF                | Yes                      |
-    # EU with HA on active  | no change                | no check                 |
-    # EU with HA on standby | no change                | no check                 |
-    # EU non-HA             | no change                | no check                 |
+    # EU with HA on active  | ON -> OFF                | Yes                      |
+    # EU with HA on standby | ON -> OFF                | Yes                      |
+    # EU non-HA             | ON -> OFF                | Yes                      |
+
+    # because we do things like create directories after starting NN,
+    # the vast majority of the time this should be True - it should only
+    # be False if this is HA and we are the Standby NN
+    ensure_safemode_off = True
+
+    # True if this is the only NameNode (non-HA) or if it's the Active one in HA
+    is_active_namenode = True
 
-    check_for_safemode_off = False
-    is_active_namenode = False
-    msg = ""
     if params.dfs_ha_enabled:
-      if upgrade_type is not None:
-        check_for_safemode_off = True
-        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
+      Logger.info("Waiting for the NameNode to broadcast whether it is Active or Standby...")
+      if check_is_active_namenode(hdfs_binary):
+        Logger.info("Waiting for the NameNode to leave Safemode since High Availability is enabled and it is Active...")
       else:
-        Logger.info("Wait for NameNode to become active.")
-        if check_is_active_namenode(hdfs_binary): # active
-          check_for_safemode_off = True
-          is_active_namenode = True
-          msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
-        else:
-          msg = "Will remain in the current safemode state."
+        # we are the STANDBY NN
+        ensure_safemode_off = False
+        is_active_namenode = False
+        Logger.info("This is the Standby NameNode; proceeding without waiting for it to leave Safemode")
     else:
-      msg = "Must wait to leave safemode since High Availability is not enabled."
-      check_for_safemode_off = True
-      is_active_namenode = True
-
-    Logger.info(msg)
+      Logger.info("Waiting for the NameNode to leave Safemode...")
 
-    # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
-    stay_in_safe_mode = False
+    # During an Express Upgrade, the NameNode will not leave Safemode until the DataNodes are started
     if upgrade_type == "nonrolling":
-      stay_in_safe_mode = True
+      Logger.info("An express upgrade has been detected and this NameNode will not leave Safemode until DataNodes are started. Safemode does not need to end before proceeding.")
+      ensure_safemode_off = False
 
-    if check_for_safemode_off:
-      Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
-      if not stay_in_safe_mode:
-        wait_for_safemode_off(hdfs_binary)
+    # wait for Safemode to end
+    if ensure_safemode_off:
+      wait_for_safemode_off(hdfs_binary)
 
-    # Always run this on non-HA, or active NameNode during HA.
-    if is_active_namenode:
+    # Always run this on the Active NameNode unless the Safemode wait was skipped.
+    # If it was skipped (e.g. during an Express Upgrade), the NameNode is still in
+    # Safemode and directories cannot be created yet.
+    if is_active_namenode and ensure_safemode_off:
       create_hdfs_directories()
       create_ranger_audit_hdfs_directories()
     else:
-      Logger.info("Skipping creating hdfs directories as is not active NN.")
+      Logger.info("Skipping creation of HDFS directories since this is not the Active NameNode.")
 
   elif action == "stop":
     import params

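As a sanity check on the new log message above: the wait is bounded by
retries * sleep_seconds, so 115 tries spaced 10 seconds apart cap out at
roughly 19 minutes, matching the int() computation in wait_for_safemode_off:

  retries = 115
  sleep_seconds = 10
  sleep_minutes = int(sleep_seconds * retries / 60)  # 1150 / 60 -> 19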
http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
index 4a89b0a..41c7366 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
@@ -1613,6 +1613,8 @@ class TestNamenode(RMFTestCase):
     self.assertResourceCalled('Execute',
                               ('ambari-python-wrap', '/usr/bin/hdp-select', 'set', 'hadoop-hdfs-namenode', version), sudo=True)
 
+    self.assertNoMoreResources()
+
 
   @patch("resource_management.core.shell.call")
   def test_pre_upgrade_restart_23(self, call_mock):
@@ -1872,6 +1874,41 @@ class TestNamenode(RMFTestCase):
     self.assertEquals("/usr/hdp/2.3.0.0-1234/hadoop/sbin", sys.modules["params"].hadoop_bin)
 
 
+  @patch("namenode_upgrade.create_upgrade_marker", MagicMock())
+  def test_express_upgrade_skips_safemode_and_directory_creation(self):
+    """
+    Tests that we always wait for Safemode to be OFF except during an Express Upgrade (EU).
+    Because the wait is skipped during an EU, no HDFS resources should be created either.
+    """
+    config_file = self.get_src_folder() + "/test/python/stacks/2.0.6/configs/default.json"
+    with open(config_file, "r") as f:
+      json_content = json.load(f)
+
+    version = '2.3.0.0-1234'
+    json_content['commandParams']['version'] = version
+
+    mocks_dict = {}
+    self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
+      classname = "NameNode",
+      command = "start",
+      command_args = ["nonrolling"],
+      config_dict = json_content,
+      stack_version = self.STACK_VERSION,
+      target = RMFTestCase.TARGET_COMMON_SERVICES,
+      call_mocks = [(0, None, ''), (0, None)],
+      mocks_dict = mocks_dict)
+
+    # skip ahead to the NameNode start command, then verify that no HdfsResource calls follow
+    self.assertResourceCalledIgnoreEarlier('Execute',
+      "ambari-sudo.sh su hdfs -l -s /bin/bash -c '[RMF_EXPORT_PLACEHOLDER]ulimit -c unlimited ;  /usr/lib/hadoop/sbin/hadoop-daemon.sh --config /etc/hadoop/conf start namenode'",
+      environment = {'HADOOP_LIBEXEC_DIR':'/usr/lib/hadoop/libexec'},
+      not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid")
+
+    self.assertNoMoreResources()
 
 class Popen_Mock:
   return_value = 1