You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@ambari.apache.org by dm...@apache.org on 2015/12/29 13:23:12 UTC
[3/4] ambari git commit: AMBARI-14479. Namenode start fails when time
taken to get out of safemode is more than 20 minutes. (additional patch)
(dlysnichenko)
AMBARI-14479. Namenode start fails when time taken to get out of safemode is more than 20 minutes. (additional patch) (dlysnichenko)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/305e67be
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/305e67be
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/305e67be
Branch: refs/heads/branch-2.2
Commit: 305e67be76e7cfc5f594b3e3c10887b7c5bfcfd1
Parents: 45304d9
Author: Lisnichenko Dmitro <dl...@hortonworks.com>
Authored: Tue Dec 29 14:22:14 2015 +0200
Committer: Lisnichenko Dmitro <dl...@hortonworks.com>
Committed: Tue Dec 29 14:22:14 2015 +0200
----------------------------------------------------------------------
.../2.1.0.2.0/package/scripts/hdfs_namenode.py | 49 ++++++++++++++------
.../HDFS/2.1.0.2.0/package/scripts/namenode.py | 36 +-------------
.../python/stacks/2.0.6/HDFS/test_namenode.py | 18 +++----
3 files changed, 47 insertions(+), 56 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
index 1766c44..d61dc2e 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -42,6 +42,40 @@ from utils import service, safe_zkfc_op, is_previous_fs_image
from setup_ranger_hdfs import setup_ranger_hdfs, create_ranger_audit_hdfs_directories
+def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
+ """
+ During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode, and then starting
+ all of the DataNodes, we need for NameNode to receive all of the block reports and leave safemode.
+ If HA is present, then this command will run individually on each NameNode, which checks for its own address.
+ """
+ import params
+
+ Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
+
+ if params.security_enabled and execute_kinit:
+ kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
+ Execute(kinit_command, user=params.hdfs_user, logoutput=True)
+
+ try:
+ # Note, this fails if namenode_address isn't prefixed with "params."
+
+ dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
+ is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
+
+ # Wait up to 30 mins
+ Execute(is_namenode_safe_mode_off,
+ tries=115,
+ try_sleep=10,
+ user=params.hdfs_user,
+ logoutput=True
+ )
+
+ # Wait a bit more since YARN still depends on block reports coming in.
+ # Also saw intermittent errors with HBASE service check if it was done too soon.
+ time.sleep(afterwait_sleep)
+ except Fail:
+ Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+
@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
if action is None:
@@ -115,8 +149,7 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e
if params.security_enabled:
Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
user = params.hdfs_user)
- dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
- is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
+
if params.dfs_ha_enabled:
is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
else:
@@ -164,17 +197,7 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e
if check_for_safemode_off:
Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
if not stay_in_safe_mode:
- Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
- try:
- # Wait up to 30 mins
- Execute(is_namenode_safe_mode_off,
- tries=65,
- try_sleep=10,
- user=params.hdfs_user,
- logoutput=True
- )
- except Fail:
- Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+ wait_for_safemode_off(hdfs_binary)
# Always run this on non-HA, or active NameNode during HA.
create_hdfs_directories(is_active_namenode_cmd)
http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
index 67db735..b308680 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
@@ -47,7 +47,7 @@ from ambari_commons import OSConst
import namenode_upgrade
-from hdfs_namenode import namenode
+from hdfs_namenode import namenode, wait_for_safemode_off
from hdfs import hdfs
import hdfs_rebalance
from utils import initiate_safe_zkfc_failover, get_hdfs_binary, get_dfsadmin_base_command
@@ -175,39 +175,7 @@ class NameNodeDefault(NameNode):
namenode_upgrade.prepare_rolling_upgrade(hfds_binary)
def wait_for_safemode_off(self, env):
- """
- During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode, and then starting
- all of the DataNodes, we need for NameNode to receive all of the block reports and leave safemode.
- If HA is present, then this command will run individually on each NameNode, which checks for its own address.
- """
- import params
-
- Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
-
- if params.security_enabled:
- kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
- Execute(kinit_command, user=params.hdfs_user, logoutput=True)
-
- try:
- hdfs_binary = self.get_hdfs_binary()
- # Note, this fails if namenode_address isn't prefixed with "params."
-
- dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
- is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
-
- # Wait up to 30 mins
- Execute(is_namenode_safe_mode_off,
- tries=65,
- try_sleep=10,
- user=params.hdfs_user,
- logoutput=True
- )
-
- # Wait a bit more since YARN still depends on block reports coming in.
- # Also saw intermittent errors with HBASE service check if it was done too soon.
- time.sleep(30)
- except Fail:
- Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+ wait_for_safemode_off(self.get_hdfs_binary(), 30, True)
def finalize_non_rolling_upgrade(self, env):
hfds_binary = self.get_hdfs_binary()
http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
index 399fd8d..c79cd5f 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
@@ -89,7 +89,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6405.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -199,7 +199,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -325,7 +325,7 @@ class TestNamenode(RMFTestCase):
user='hdfs',
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -418,7 +418,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -512,7 +512,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -612,7 +612,7 @@ class TestNamenode(RMFTestCase):
user = 'hdfs',
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -712,7 +712,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -811,7 +811,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True
@@ -918,7 +918,7 @@ class TestNamenode(RMFTestCase):
not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
)
self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
- tries=65,
+ tries=115,
try_sleep=10,
user="hdfs",
logoutput=True