You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2016/06/15 14:20:25 UTC
ambari git commit: AMBARI-17236 - Namenode start step failed during EU with RetriableException (jonathanhurley)
Repository: ambari
Updated Branches:
refs/heads/trunk b18ce3aef -> fd83a142e
AMBARI-17236 - Namenode start step failed during EU with RetriableException (jonathanhurley)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/fd83a142
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/fd83a142
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/fd83a142
Branch: refs/heads/trunk
Commit: fd83a142e374f155f148db0947b245edfaeb0568
Parents: b18ce3a
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Tue Jun 14 17:10:42 2016 -0400
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Wed Jun 15 10:20:07 2016 -0400
----------------------------------------------------------------------
.../libraries/resources/hdfs_resource.py | 6 +-
.../2.1.0.2.0/package/scripts/hdfs_namenode.py | 87 +++++++++-----------
.../python/stacks/2.0.6/HDFS/test_namenode.py | 37 +++++++++
3 files changed, 78 insertions(+), 52 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
----------------------------------------------------------------------
diff --git a/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py b/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
index 18e61fb..5761fd6 100644
--- a/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
+++ b/ambari-common/src/main/python/resource_management/libraries/resources/hdfs_resource.py
@@ -30,7 +30,7 @@ The cause is that for every call new connection initialized, with datanodes, nam
While this resource can gather the directories/files to create/delete/copyFromLocal.
And after just with one call create all that.
-action = create_delayed / delete_delayed. Are for gathering information about what you want
+action = create_on_execute / delete_on_execute. Are for gathering information about what you want
to create.
After everything is gathered you should execute action = execute. To perform delayed actions
@@ -52,7 +52,7 @@ class HdfsResource(Resource):
target = ResourceArgument(default=lambda obj: obj.name)
# "directory" or "file"
type = ResourceArgument()
- # "create_delayed" or "delete_delayed" or "execute"
+ # "create_on_execute" or "delete_on_execute" or "execute"
action = ForcedListArgument()
# if present - copies file/directory from local path {source} to hadoop path - {target}
source = ResourceArgument()
@@ -103,5 +103,5 @@ class HdfsResource(Resource):
dfs_type = ResourceArgument(default="")
#action 'execute' immediately creates all pending files/directories in efficient manner
- #action 'create_delayed/delete_delayed' adds file/directory to list of pending directories
+ #action 'create_on_execute/delete_on_execute' adds file/directory to list of pending directories
actions = Resource.actions + ["create_on_execute", "delete_on_execute", "execute"]
http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
index 635f159..5a431aa 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -51,7 +51,11 @@ def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
"""
import params
- Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
+ retries = 115
+ sleep_seconds = 10
+ sleep_minutes = int(sleep_seconds * retries / 60)
+
+ Logger.info("Waiting up to {0} minutes for the NameNode to leave Safemode...".format(sleep_minutes))
if params.security_enabled and execute_kinit:
kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
@@ -64,18 +68,14 @@ def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
# Wait up to 30 mins
- Execute(is_namenode_safe_mode_off,
- tries=115,
- try_sleep=10,
- user=params.hdfs_user,
- logoutput=True
- )
+ Execute(is_namenode_safe_mode_off, tries=retries, try_sleep=sleep_seconds,
+ user=params.hdfs_user, logoutput=True)
# Wait a bit more since YARN still depends on block reports coming in.
# Also saw intermittent errors with HBASE service check if it was done too soon.
time.sleep(afterwait_sleep)
except Fail:
- Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+ Logger.error("The NameNode is still in Safemode. Please be careful with commands that need Safemode OFF.")
@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
@@ -159,64 +159,53 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
user = params.hdfs_user)
- if params.dfs_ha_enabled:
- is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
- else:
- is_active_namenode_cmd = True
-
- # During NonRolling Upgrade, both NameNodes are initially down,
- # so no point in checking if this is the active or standby.
- if upgrade_type == "nonrolling":
- is_active_namenode_cmd = False
-
# ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
# no-HA | ON -> OFF | Yes |
# HA and active | ON -> OFF | Yes |
# HA and standby | no change | no check |
# RU with HA on active | ON -> OFF | Yes |
# RU with HA on standby | ON -> OFF | Yes |
- # EU with HA on active | no change | no check |
- # EU with HA on standby | no change | no check |
- # EU non-HA | no change | no check |
+ # EU with HA on active | ON -> OFF | Yes |
+ # EU with HA on standby | ON -> OFF | Yes |
+ # EU non-HA | ON -> OFF | Yes |
+
+ # because we do things like create directories after starting NN,
+ # the vast majority of the time this should be True - it should only
+ # be False if this is HA and we are the Standby NN
+ ensure_safemode_off = True
+
+ # True if this is the only NameNode (non-HA) or if its the Active one in HA
+ is_active_namenode = True
- check_for_safemode_off = False
- is_active_namenode = False
- msg = ""
if params.dfs_ha_enabled:
- if upgrade_type is not None:
- check_for_safemode_off = True
- msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
+ Logger.info("Waiting for the NameNode to broadcast whether it is Active or Standby...")
+ if check_is_active_namenode(hdfs_binary):
+ Logger.info("Waiting for the NameNode to leave Safemode since High Availability is enabled and it is Active...")
else:
- Logger.info("Wait for NameNode to become active.")
- if check_is_active_namenode(hdfs_binary): # active
- check_for_safemode_off = True
- is_active_namenode = True
- msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
- else:
- msg = "Will remain in the current safemode state."
+ # we are the STANDBY NN
+ ensure_safemode_off = False
+ is_active_namenode = False
+ Logger.info("This is the Standby NameNode; proceeding without waiting for it to leave Safemode")
else:
- msg = "Must wait to leave safemode since High Availability is not enabled."
- check_for_safemode_off = True
- is_active_namenode = True
-
- Logger.info(msg)
+ Logger.info("Waiting for the NameNode to leave Safemode...")
- # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
- stay_in_safe_mode = False
+ # During an Express Upgrade, NameNode will not leave SafeMode until the DataNodes are started
if upgrade_type == "nonrolling":
- stay_in_safe_mode = True
+ Logger.info("An express upgrade has been detected and this NameNode will not leave Safemode until DataNodes are started. Safemode does not need to end before proceeding.")
+ ensure_safemode_off = False
- if check_for_safemode_off:
- Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
- if not stay_in_safe_mode:
- wait_for_safemode_off(hdfs_binary)
+ # wait for Safemode to end
+ if ensure_safemode_off:
+ wait_for_safemode_off(hdfs_binary)
- # Always run this on non-HA, or active NameNode during HA.
- if is_active_namenode:
+ # Always run this on the "Active" NN unless Safemode has been ignored
+ # in the case where safemode was ignored (like during an express upgrade), then
+ # NN will be in SafeMode and cannot have directories created
+ if is_active_namenode and ensure_safemode_off:
create_hdfs_directories()
create_ranger_audit_hdfs_directories()
else:
- Logger.info("Skipping creating hdfs directories as is not active NN.")
+ Logger.info("Skipping creation of HDFS directories since this is not the Active NameNode.")
elif action == "stop":
import params
http://git-wip-us.apache.org/repos/asf/ambari/blob/fd83a142/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
index 4a89b0a..41c7366 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
@@ -1613,6 +1613,8 @@ class TestNamenode(RMFTestCase):
self.assertResourceCalled('Execute',
('ambari-python-wrap', '/usr/bin/hdp-select', 'set', 'hadoop-hdfs-namenode', version), sudo=True)
+ self.assertNoMoreResources()
+
@patch("resource_management.core.shell.call")
def test_pre_upgrade_restart_23(self, call_mock):
@@ -1872,6 +1874,41 @@ class TestNamenode(RMFTestCase):
self.assertEquals("/usr/hdp/2.3.0.0-1234/hadoop/sbin", sys.modules["params"].hadoop_bin)
+ @patch("namenode_upgrade.create_upgrade_marker", MagicMock())
+ def test_express_upgrade_skips_safemode_and_directory_creation(self):
+ """
+ Tests that we wait for Safemode to be OFF no matter what except for EU. And, because of that,
+ EUs don't try to create HDFS resources.
+
+ :param self:
+ :param create_upgrade_marker_mock:
+ :return:
+ """
+ config_file = self.get_src_folder() + "/test/python/stacks/2.0.6/configs/default.json"
+ with open(config_file, "r") as f:
+ json_content = json.load(f)
+
+ version = '2.3.0.0-1234'
+ json_content['commandParams']['version'] = version
+
+ mocks_dict = {}
+ self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
+ classname = "NameNode",
+ command = "start",
+ command_args = ["nonrolling"],
+ config_dict = json_content,
+ stack_version = self.STACK_VERSION,
+ target = RMFTestCase.TARGET_COMMON_SERVICES,
+ call_mocks = [(0, None, ''), (0, None)],
+ mocks_dict = mocks_dict)
+
+ # jump right to the start of the NN and then verify that we DO NOT call HdfsResource after
+ self.assertResourceCalledIgnoreEarlier('Execute',
+ "ambari-sudo.sh su hdfs -l -s /bin/bash -c '[RMF_EXPORT_PLACEHOLDER]ulimit -c unlimited ; /usr/lib/hadoop/sbin/hadoop-daemon.sh --config /etc/hadoop/conf start namenode'",
+ environment = {'HADOOP_LIBEXEC_DIR':'/usr/lib/hadoop/libexec'},
+ not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid")
+
+ self.assertNoMoreResources()
class Popen_Mock:
return_value = 1