You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by dm...@apache.org on 2015/12/29 13:23:12 UTC

[3/4] ambari git commit: AMBARI-14479. Namenode start fails when time taken to get out of safemode is more than 20 minutes. (additional patch) (dlysnichenko)

AMBARI-14479. Namenode start fails when time taken to get out of safemode is more than 20 minutes. (additional patch) (dlysnichenko)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/305e67be
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/305e67be
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/305e67be

Branch: refs/heads/branch-2.2
Commit: 305e67be76e7cfc5f594b3e3c10887b7c5bfcfd1
Parents: 45304d9
Author: Lisnichenko Dmitro <dl...@hortonworks.com>
Authored: Tue Dec 29 14:22:14 2015 +0200
Committer: Lisnichenko Dmitro <dl...@hortonworks.com>
Committed: Tue Dec 29 14:22:14 2015 +0200

----------------------------------------------------------------------
 .../2.1.0.2.0/package/scripts/hdfs_namenode.py  | 49 ++++++++++++++------
 .../HDFS/2.1.0.2.0/package/scripts/namenode.py  | 36 +-------------
 .../python/stacks/2.0.6/HDFS/test_namenode.py   | 18 +++----
 3 files changed, 47 insertions(+), 56 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
index 1766c44..d61dc2e 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -42,6 +42,40 @@ from utils import service, safe_zkfc_op, is_previous_fs_image
 from setup_ranger_hdfs import setup_ranger_hdfs, create_ranger_audit_hdfs_directories
 
 
+def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
+  """
+  During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode, and then starting
+  all of the DataNodes, we need to wait for the NameNode to receive all of the block reports and leave safemode.
+  If HA is present, then this command will run individually on each NameNode, which checks for its own address.
+  """
+  import params
+
+  Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
+
+  if params.security_enabled and execute_kinit:
+    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
+    Execute(kinit_command, user=params.hdfs_user, logoutput=True)
+
+  try:
+    # Note, this fails if namenode_address isn't prefixed with "params."
+
+    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
+    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
+
+    # Wait up to 30 mins
+    Execute(is_namenode_safe_mode_off,
+            tries=115,
+            try_sleep=10,
+            user=params.hdfs_user,
+            logoutput=True
+            )
+
+    # Wait a bit more since YARN still depends on block reports coming in.
+    # Also saw intermittent errors with HBASE service check if it was done too soon.
+    time.sleep(afterwait_sleep)
+  except Fail:
+    Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+
 @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
 def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
   if action is None:
@@ -115,8 +149,7 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e
     if params.security_enabled:
       Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
               user = params.hdfs_user)
-    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
-    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
+
     if params.dfs_ha_enabled:
       is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
     else:
@@ -164,17 +197,7 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e
     if check_for_safemode_off:
       Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
       if not stay_in_safe_mode:
-        Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
-        try:
-          # Wait up to 30 mins
-          Execute(is_namenode_safe_mode_off,
-                  tries=65,
-                  try_sleep=10,
-                  user=params.hdfs_user,
-                  logoutput=True
-          )
-        except Fail:
-          Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+        wait_for_safemode_off(hdfs_binary)
 
     # Always run this on non-HA, or active NameNode during HA.
     create_hdfs_directories(is_active_namenode_cmd)

http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
index 67db735..b308680 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
@@ -47,7 +47,7 @@ from ambari_commons import OSConst
 
 
 import namenode_upgrade
-from hdfs_namenode import namenode
+from hdfs_namenode import namenode, wait_for_safemode_off
 from hdfs import hdfs
 import hdfs_rebalance
 from utils import initiate_safe_zkfc_failover, get_hdfs_binary, get_dfsadmin_base_command
@@ -175,39 +175,7 @@ class NameNodeDefault(NameNode):
     namenode_upgrade.prepare_rolling_upgrade(hfds_binary)
 
   def wait_for_safemode_off(self, env):
-    """
-    During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode, and then starting
-    all of the DataNodes, we need for NameNode to receive all of the block reports and leave safemode.
-    If HA is present, then this command will run individually on each NameNode, which checks for its own address.
-    """
-    import params
-
-    Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
-
-    if params.security_enabled:
-      kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
-      Execute(kinit_command, user=params.hdfs_user, logoutput=True)
-
-    try:
-      hdfs_binary = self.get_hdfs_binary()
-      # Note, this fails if namenode_address isn't prefixed with "params."
-
-      dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
-      is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
-
-      # Wait up to 30 mins
-      Execute(is_namenode_safe_mode_off,
-              tries=65,
-              try_sleep=10,
-              user=params.hdfs_user,
-              logoutput=True
-      )
-
-      # Wait a bit more since YARN still depends on block reports coming in.
-      # Also saw intermittent errors with HBASE service check if it was done too soon.
-      time.sleep(30)
-    except Fail:
-      Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
+    wait_for_safemode_off(self.get_hdfs_binary(), 30, True)
 
   def finalize_non_rolling_upgrade(self, env):
     hfds_binary = self.get_hdfs_binary()

http://git-wip-us.apache.org/repos/asf/ambari/blob/305e67be/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
index 399fd8d..c79cd5f 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py
@@ -89,7 +89,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6405.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -199,7 +199,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -325,7 +325,7 @@ class TestNamenode(RMFTestCase):
                               user='hdfs',
                               )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -418,7 +418,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -512,7 +512,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -612,7 +612,7 @@ class TestNamenode(RMFTestCase):
         user = 'hdfs',
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -712,7 +712,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -811,7 +811,7 @@ class TestNamenode(RMFTestCase):
         not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
     )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-        tries=65,
+        tries=115,
         try_sleep=10,
         user="hdfs",
         logoutput=True
@@ -918,7 +918,7 @@ class TestNamenode(RMFTestCase):
                               not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
                               )
     self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
-                              tries=65,
+                              tries=115,
                               try_sleep=10,
                               user="hdfs",
                               logoutput=True