You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jl...@apache.org on 2015/03/13 23:28:34 UTC

ambari git commit: AMBARI-10067: ZKFailoverController failed on restart (jluniya)

Repository: ambari
Updated Branches:
  refs/heads/trunk 88aed0b8e -> a5c571e97


AMBARI-10067: ZKFailoverController failed on restart (jluniya)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/a5c571e9
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/a5c571e9
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/a5c571e9

Branch: refs/heads/trunk
Commit: a5c571e973badd90da3cb34e6c8345a3d80c9e4c
Parents: 88aed0b
Author: Jayush Luniya <jl...@hortonworks.com>
Authored: Fri Mar 13 15:28:27 2015 -0700
Committer: Jayush Luniya <jl...@hortonworks.com>
Committed: Fri Mar 13 15:28:27 2015 -0700

----------------------------------------------------------------------
 .../2.1.0.2.0/package/scripts/hdfs_namenode.py  | 45 ++++++++++----------
 .../2.1.0.2.0/package/scripts/zkfc_slave.py     | 26 ++++++++++-
 .../test/python/stacks/2.0.6/HDFS/test_zkfc.py  | 16 +++----
 3 files changed, 52 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/a5c571e9/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
index c89eeba..615dd54 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -49,9 +49,10 @@ def namenode(action=None, do_format=True, rolling_restart=False, env=None):
               group=params.user_group
     )
 
-    if params.dfs_ha_enabled:
-      # if the current host is the standby NameNode in an HA deployment
-      if params.hostname == params.dfs_ha_namenode_standby:
+    if params.dfs_ha_enabled and \
+      params.dfs_ha_namenode_standby is not None and \
+      params.hostname == params.dfs_ha_namenode_standby:
+        # if the current host is the standby NameNode in an HA deployment
         # run the bootstrap command, to start the NameNode in standby mode
         # this requires that the active NameNode is already up and running,
         # so this execute should be re-tried upon failure, up to a timeout
@@ -178,25 +179,25 @@ def format_namenode(force=None):
             recursive = True
           )
   else:
-    if params.dfs_ha_namenode_active is not None:
-      if params.hostname == params.dfs_ha_namenode_active:
-        # check and run the format command in the HA deployment scenario
-        # only format the "active" namenode in an HA deployment
-        if force:
-          ExecuteHadoop('namenode -format',
-                        kinit_override=True,
-                        bin_dir=params.hadoop_bin_dir,
-                        conf_dir=hadoop_conf_dir)
-        else:
-          if not is_namenode_formatted(params):
-            Execute(format("yes Y | hdfs --config {hadoop_conf_dir} namenode -format"),
-                    user = params.hdfs_user,
-                    path = [params.hadoop_bin_dir]
+    if params.dfs_ha_namenode_active is not None and \
+       params.hostname == params.dfs_ha_namenode_active:
+      # check and run the format command in the HA deployment scenario
+      # only format the "active" namenode in an HA deployment
+      if force:
+        ExecuteHadoop('namenode -format',
+                      kinit_override=True,
+                      bin_dir=params.hadoop_bin_dir,
+                      conf_dir=hadoop_conf_dir)
+      else:
+        if not is_namenode_formatted(params):
+          Execute(format("yes Y | hdfs --config {hadoop_conf_dir} namenode -format"),
+                  user = params.hdfs_user,
+                  path = [params.hadoop_bin_dir]
+          )
+          for m_dir in mark_dir:
+            Directory(m_dir,
+              recursive = True
             )
-            for m_dir in mark_dir:
-              Directory(m_dir,
-                recursive = True
-              )
 
 def is_namenode_formatted(params):
   old_mark_dirs = params.namenode_formatted_old_mark_dirs
@@ -289,7 +290,7 @@ def bootstrap_standby_namenode(params):
     Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))
     for i in range(iterations):
       Logger.info('Try %d out of %d' % (i+1, iterations))
-      code, out = shell.call(bootstrap_cmd, logoutput=True, user=params.hdfs_user)
+      code, out = shell.call(bootstrap_cmd, logoutput=False, user=params.hdfs_user)
       if code == 0:
         Logger.info("Standby namenode bootstrapped successfully")
         return True

http://git-wip-us.apache.org/repos/asf/ambari/blob/a5c571e9/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/zkfc_slave.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/zkfc_slave.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/zkfc_slave.py
index 42dc7d3..533ea7a 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/zkfc_slave.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/zkfc_slave.py
@@ -48,8 +48,11 @@ class ZkfcSlave(Script):
     # only run this format command if the active namenode hostname is set
     # The Ambari UI HA Wizard prompts the user to run this command
     # manually, so this guarantees it is only run in the Blueprints case
-    if params.dfs_ha_enabled and params.dfs_ha_namenode_active is not None:
-        Execute("hdfs zkfc -formatZK -force -nonInteractive", user=params.hdfs_user)
+    if params.dfs_ha_enabled and \
+       params.dfs_ha_namenode_active is not None:
+      success =  initialize_ha_zookeeper(params)
+      if not success:
+        raise Fail("Could not initialize HA state in zookeeper")
 
     utils.service(
       action="start", name="zkfc", user=params.hdfs_user, create_pid_dir=True,
@@ -121,6 +124,25 @@ class ZkfcSlave(Script):
     else:
       self.put_structured_out({"securityState": "UNSECURED"})
 
+def initialize_ha_zookeeper(params):
+  try:
+    iterations = 10
+    formatZK_cmd = "hdfs zkfc -formatZK -nonInteractive"
+    Logger.info("Initialize HA state in ZooKeeper: %s" % (formatZK_cmd))
+    for i in range(iterations):
+      Logger.info('Try %d out of %d' % (i+1, iterations))
+      code, out = shell.call(formatZK_cmd, logoutput=False, user=params.hdfs_user)
+      if code == 0:
+        Logger.info("HA state initialized in ZooKeeper successfully")
+        return True
+      elif code == 2:
+        Logger.info("HA state already initialized in ZooKeeper")
+        return True
+      else:
+        Logger.warning('HA state initialization in ZooKeeper failed with %d error code. Will retry' % (code))
+  except Exception as ex:
+    Logger.error('HA state initialization in ZooKeeper threw an exception. Reason %s' %(str(ex)))
+  return False
 
 if __name__ == "__main__":
   ZkfcSlave().execute()

http://git-wip-us.apache.org/repos/asf/ambari/blob/a5c571e9/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_zkfc.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_zkfc.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_zkfc.py
index 8aa4871..bc15f3f 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_zkfc.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_zkfc.py
@@ -20,6 +20,7 @@ limitations under the License.
 from stacks.utils.RMFTestCase import *
 from ambari_commons import OSCheck
 from mock.mock import MagicMock, patch
+from resource_management.core import shell
 
 class TestZkfc(RMFTestCase):
   COMMON_SERVICES_PACKAGE_DIR = "HDFS/2.1.0.2.0/package"
@@ -206,7 +207,7 @@ class TestZkfc(RMFTestCase):
                               )
     self.assertNoMoreResources()
 
-
+  @patch.object(shell, "call", new=MagicMock(return_value=(0,"")))
   def test_start_with_ha_active_namenode_bootstrap(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/zkfc_slave.py",
                        classname = "ZkfcSlave",
@@ -250,11 +251,7 @@ class TestZkfc(RMFTestCase):
                               group = 'hadoop',
                               mode = 0755
     )
-
-    # verify that the znode initialization occurs prior to ZKFC startup
-    self.assertResourceCalled('Execute', 'hdfs zkfc -formatZK -force -nonInteractive',
-                              user = 'hdfs')
-
+    # TODO: verify that the znode initialization occurs prior to ZKFC startup
     self.assertResourceCalled('Directory', '/var/run/hadoop/hdfs',
                               owner = 'hdfs',
                               recursive = True,
@@ -273,6 +270,7 @@ class TestZkfc(RMFTestCase):
                               )
     self.assertNoMoreResources()
 
+  @patch.object(shell, "call", new=MagicMock(return_value=(2,"")))
   def test_start_with_ha_standby_namenode_bootstrap(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/zkfc_slave.py",
                        classname = "ZkfcSlave",
@@ -316,11 +314,7 @@ class TestZkfc(RMFTestCase):
                               group = 'hadoop',
                               mode = 0755
     )
-
-    # verify that the znode initialization occurs prior to ZKFC startup
-    self.assertResourceCalled('Execute', 'hdfs zkfc -formatZK -force -nonInteractive',
-                              user = 'hdfs')
-
+    # TODO: verify that the znode initialization occurs prior to ZKFC startup
     self.assertResourceCalled('Directory', '/var/run/hadoop/hdfs',
                               owner = 'hdfs',
                               recursive = True,