You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2014/12/16 02:14:17 UTC

ambari git commit: AMBARI-8694 - Upgrade Pack definition for Core Slaves Components (jonathanhurley)

Repository: ambari
Updated Branches:
  refs/heads/trunk ff7f90843 -> 5b0ccc012


AMBARI-8694 - Upgrade Pack definition for Core Slaves Components (jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/5b0ccc01
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/5b0ccc01
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/5b0ccc01

Branch: refs/heads/trunk
Commit: 5b0ccc0120f2f302ad788acb199ec2b3ff8dff75
Parents: ff7f908
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Fri Dec 12 17:04:41 2014 -0500
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Mon Dec 15 20:13:10 2014 -0500

----------------------------------------------------------------------
 .../apache/ambari/server/orm/dao/AlertsDAO.java |   7 +-
 .../services/HDFS/package/scripts/datanode.py   |  26 ++++-
 .../HDFS/package/scripts/datanode_upgrade.py    | 114 +++++++++++++++++++
 .../services/HDFS/package/scripts/params.py     |   1 +
 .../YARN/package/scripts/nodemanager.py         |  35 ++----
 .../YARN/package/scripts/nodemanager_upgrade.py |  74 ++++++++++++
 .../services/YARN/package/scripts/params.py     |  18 ++-
 .../stacks/HDP/2.2/upgrades/upgrade-2.2.xml     |  43 +------
 .../python/stacks/2.0.6/HDFS/test_datanode.py   |  74 +++++++++++-
 .../stacks/2.0.6/YARN/test_nodemanager.py       |  61 ++++++++++
 10 files changed, 371 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/AlertsDAO.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/AlertsDAO.java b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/AlertsDAO.java
index 47d26cb..797d759 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/AlertsDAO.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/AlertsDAO.java
@@ -341,10 +341,9 @@ public class AlertsDAO {
 
   /**
    * Retrieve the summary alert information for all hosts. This is different
-   * from {@link #findCurrentCounts(long, String, String)} since this will not
-   * return alerts that are not related to a particular host, such as aggregate
-   * alerts. In general, {@link #findCurrentCounts(long, String, String)} and
-   * this method will return very similar counts.
+   * from {@link #findCurrentCounts(long, String, String)} since this will
+   * return only alerts related to hosts and those values will be the total
+   * number of hosts affected, not the total number of alerts.
    *
    * @param clusterId
    *          the cluster id

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode.py
index 327a457..4afd4ce 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode.py
@@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 
 """
-
+import datanode_upgrade
 from hdfs_datanode import datanode
 from resource_management import *
 from resource_management.libraries.functions.version import compare_versions, format_hdp_stack_version
@@ -30,14 +30,25 @@ class DataNode(Script):
     self.install_packages(env, params.exclude_packages)
     env.set_params(params)
 
+
   def pre_rolling_restart(self, env):
-    Logger.info("Executing Rolling Upgrade pre-restart")
+    Logger.info("Executing DataNode Rolling Upgrade pre-restart")
     import params
     env.set_params(params)
 
     if params.version and compare_versions(format_hdp_stack_version(params.version), '2.2.0.0') >= 0:
       Execute(format("hdp-select set hadoop-hdfs-datanode {version}"))
 
+
+  def post_rolling_restart(self, env):
+    Logger.info("Executing DataNode Rolling Upgrade post-restart")
+    import params
+    env.set_params(params)
+
+    # ensure the DataNode has started and rejoined the cluster
+    datanode_upgrade.post_upgrade_check()
+
+
   def start(self, env, rolling_restart=False):
     import params
 
@@ -45,11 +56,19 @@ class DataNode(Script):
     self.configure(env)
     datanode(action="start")
 
+
   def stop(self, env, rolling_restart=False):
     import params
 
     env.set_params(params)
-    datanode(action="stop")
+
+    # pre-upgrade steps shutdown the datanode, so there's no need to call
+    # action=stop
+    if rolling_restart:
+      datanode_upgrade.pre_upgrade_shutdown()
+    else:
+      datanode(action="stop")
+
 
   def configure(self, env):
     import params
@@ -57,6 +76,7 @@ class DataNode(Script):
     hdfs()
     datanode(action="configure")
 
+
   def status(self, env):
     import status_params
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode_upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode_upgrade.py
new file mode 100644
index 0000000..88af1f9
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/datanode_upgrade.py
@@ -0,0 +1,114 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+from resource_management.core.logger import Logger
+from resource_management.core.exceptions import Fail
+from resource_management.core.resources.system import Execute
+from resource_management.core.shell import call
+from resource_management.libraries.functions import format
+from resource_management.libraries.functions.decorator import retry
+
+
+def pre_upgrade_shutdown():
+  """
+  Runs "hdfs dfsadmin -shutdownDatanode {dfs_dn_ipc_address} upgrade" as the
+  HDFS user to stop the DataNode in preparation for an upgrade, and then checks
+  "hdfs dfsadmin -getDatanodeInfo" to ensure the DataNode has shut down.
+  A Kerberos ticket is obtained first (dn_kinit_cmd) if security is enabled.
+  :return: None
+  """
+  import params
+
+  Logger.info('DataNode executing "shutdownDatanode" command in preparation for upgrade...')
+  if params.security_enabled:
+    Execute(params.dn_kinit_cmd, user = params.hdfs_user)
+
+  command = format('hdfs dfsadmin -shutdownDatanode {dfs_dn_ipc_address} upgrade')
+  Execute(command, user=params.hdfs_user, tries=1 )
+
+  # verify that the datanode is down; the check raises Fail while it still responds
+  _check_datanode_shutdown()
+
+
+def post_upgrade_check():
+  """
+  Verifies that the DataNode has rejoined the cluster after its restart.
+  A Kerberos ticket is obtained first (dn_kinit_cmd) if security is enabled.
+  :return: None
+  """
+  import params
+
+  Logger.info("Checking that the DataNode has rejoined the cluster after upgrade...")
+  if params.security_enabled:
+    Execute(params.dn_kinit_cmd,user = params.hdfs_user)
+
+  # verify that this host shows up in the output of "hdfs dfsadmin -report -live"
+  _check_datanode_startup()
+
+
+@retry(times=12, sleep_time=10, err_class=Fail)
+def _check_datanode_shutdown():
+  """
+  Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
+  several times, pausing in between runs. Once the DataNode stops responding
+  this method will return, otherwise it will raise a Fail(...) and retry
+  automatically.
+  :return: None
+  """
+  import params
+
+  command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+  # a failing command means the DataNode is gone; do not swallow KeyboardInterrupt/SystemExit
+  try:
+    Execute(command, user=params.hdfs_user, tries=1)
+  except Exception:
+    Logger.info("DataNode has successfully shutdown for upgrade.")
+    return
+
+  Logger.info("DataNode has not shutdown.")
+  raise Fail('DataNode has not shutdown.')
+
+
+@retry(times=12, sleep_time=10, err_class=Fail)
+def _check_datanode_startup():
+  """
+  Checks that a DataNode is reported as being alive via the
+  "hdfs dfsadmin -report -live" command. Once the DataNode is found to be
+  alive this method will return, otherwise it will raise a Fail(...) and retry
+  automatically.
+  :return: None
+  """
+  import params
+
+  try:
+    # equivalent to: su - hdfs -c "hdfs dfsadmin -report -live"
+    command = 'hdfs dfsadmin -report -live'
+    return_code, hdfs_output = call(command, user=params.hdfs_user)
+  except Exception:
+    raise Fail('Unable to determine if the DataNode has started after upgrade.')
+
+  if return_code == 0:
+    if params.hostname.lower() in hdfs_output.lower():
+      Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
+      return
+    else:
+      raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))
+
+  # return_code is not 0, so the report itself could not be produced; fail
+  raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/params.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/params.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/params.py
index d88e16f..12353de 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/params.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/scripts/params.py
@@ -43,6 +43,7 @@ dfs_dn_addr = default('/configurations/hdfs-site/dfs.datanode.address', None)
 dfs_dn_http_addr = default('/configurations/hdfs-site/dfs.datanode.http.address', None)
 dfs_dn_https_addr = default('/configurations/hdfs-site/dfs.datanode.https.address', None)
 dfs_http_policy = default('/configurations/hdfs-site/dfs.http.policy', None)
+dfs_dn_ipc_address = config['configurations']['hdfs-site']['dfs.datanode.ipc.address']
 secure_dn_ports_are_in_use = False
 
 #hadoop params

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager.py
index 0fcf20d..f2cba8b 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager.py
@@ -18,32 +18,16 @@ limitations under the License.
 Ambari Agent
 
 """
-import re
+
+import nodemanager_upgrade
 
 from resource_management import *
-from resource_management.libraries.functions.decorator import retry
 from resource_management.libraries.functions.version import compare_versions, format_hdp_stack_version
 from resource_management.libraries.functions.format import format
-from resource_management.core.shell import call
 
 from yarn import yarn
 from service import service
 
-
-@retry(times=10, sleep_time=2, err_class=Fail)
-def call_and_match_output(command, regex_expression, err_message):
-  """
-  Call the command and performs a regex match on the output for the specified expression.
-  :param command: Command to call
-  :param regex_expression: Regex expression to search in the output
-  """
-  # TODO Rolling Upgrade, does this work in Ubuntu? If it doesn't see dynamic_variable_interpretation.py to see how stdout was redirected
-  # to a temporary file, which was then read.
-  code, out = call(command, verbose=True)
-  if not (out and re.search(regex_expression, out, re.IGNORECASE)):
-    raise Fail(err_message)
-
-
 class Nodemanager(Script):
   def install(self, env):
     self.install_packages(env)
@@ -54,7 +38,7 @@ class Nodemanager(Script):
     yarn(name="nodemanager")
 
   def pre_rolling_restart(self, env):
-    Logger.info("Executing Rolling Upgrade post-restart")
+    Logger.info("Executing NodeManager Rolling Upgrade pre-restart")
     import params
     env.set_params(params)
 
@@ -65,25 +49,20 @@ class Nodemanager(Script):
     import params
     env.set_params(params)
     self.configure(env) # FOR SECURITY
-    service('nodemanager',
-            action='start'
-    )
+    service('nodemanager',action='start')
 
   def post_rolling_restart(self, env):
-    Logger.info("Executing Rolling Upgrade post-restart")
+    Logger.info("Executing NodeManager Rolling Upgrade post-restart")
     import params
     env.set_params(params)
 
-    nm_status_command = format("yarn node -status {nm_address}")
-    call_and_match_output(nm_status_command, 'Node-State : RUNNING',  "Failed to check NodeManager status")
+    nodemanager_upgrade.post_upgrade_check()
 
   def stop(self, env, rolling_restart=False):
     import params
     env.set_params(params)
 
-    service('nodemanager',
-            action='stop'
-    )
+    service('nodemanager',action='stop')
 
   def status(self, env):
     import status_params

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager_upgrade.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager_upgrade.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager_upgrade.py
new file mode 100644
index 0000000..e82c320
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/nodemanager_upgrade.py
@@ -0,0 +1,74 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+import subprocess
+
+from resource_management.core.logger import Logger
+from resource_management.core.exceptions import Fail
+from resource_management.core.resources.system import Execute
+from resource_management.core.shell import call
+from resource_management.libraries.functions.decorator import retry
+
+
+def post_upgrade_check():
+  '''
+  Checks that the NodeManager has rejoined the cluster after its restart.
+  A Kerberos ticket is obtained first if security is enabled and a kinit command is set.
+  :return: None
+  '''
+  import params
+
+  Logger.info('NodeManager executing "yarn node -list -states=RUNNING" to verify the node has rejoined the cluster...')
+  if params.security_enabled and params.nodemanager_kinit_cmd:
+    Execute(params.nodemanager_kinit_cmd, user = params.yarn_user)
+
+  _check_nodemanager_startup()
+
+
+@retry(times=12, sleep_time=10, err_class=Fail)
+def _check_nodemanager_startup():
+  '''
+  Checks that a NodeManager is in a RUNNING state in the cluster via
+  "yarn node -list -states=RUNNING" command. Once the NodeManager is found to be
+  alive this method will return, otherwise it will raise a Fail(...) and retry
+  automatically. The command runs as the YARN user, the same user the kinit uses.
+  :return: None
+  '''
+  import params
+
+  command = 'yarn node -list -states=RUNNING'
+
+  try:
+    # equivalent to: su - yarn -c "yarn node -list -states=RUNNING"
+    return_code, yarn_output = call(command, user=params.yarn_user)
+  except Exception:
+    raise Fail('Unable to determine if the NodeManager has started after upgrade.')
+
+  if return_code == 0:
+    hostname = params.hostname.lower()
+    nodemanager_address = params.nm_address.lower()
+    yarn_output = yarn_output.lower()
+
+    if hostname in yarn_output or nodemanager_address in yarn_output:
+      Logger.info('NodeManager with ID {0} has rejoined the cluster.'.format(nodemanager_address))
+      return
+    else:
+      raise Fail('NodeManager with ID {0} was not found in the list of running NodeManagers'.format(nodemanager_address))
+
+  raise Fail('Unable to determine if the NodeManager has started after upgrade (result code {0})'.format(str(return_code)))

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/params.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/params.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/params.py
index 2128687..67422d9 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/params.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/scripts/params.py
@@ -131,11 +131,15 @@ exclude_file_path = default("/configurations/yarn-site/yarn.resourcemanager.node
 ats_host = set(default("/clusterHostInfo/app_timeline_server_hosts", []))
 has_ats = not len(ats_host) == 0
 
+# default kinit commands
+rm_kinit_cmd = ""
+yarn_timelineservice_kinit_cmd = ""
+nodemanager_kinit_cmd = ""
+
 if security_enabled:
   _rm_principal_name = config['configurations']['yarn-site']['yarn.resourcemanager.principal']
-  _rm_keytab = config['configurations']['yarn-site']['yarn.resourcemanager.keytab']
   _rm_principal_name = _rm_principal_name.replace('_HOST',hostname.lower())
-  
+  _rm_keytab = config['configurations']['yarn-site']['yarn.resourcemanager.keytab']
   rm_kinit_cmd = format("{kinit_path_local} -kt {_rm_keytab} {_rm_principal_name};")
 
   # YARN timeline security options are only available in HDP Champlain
@@ -144,9 +148,12 @@ if security_enabled:
     _yarn_timelineservice_principal_name = _yarn_timelineservice_principal_name.replace('_HOST', hostname.lower())
     _yarn_timelineservice_keytab = config['configurations']['yarn-site']['yarn.timeline-service.keytab']
     yarn_timelineservice_kinit_cmd = format("{kinit_path_local} -kt {_yarn_timelineservice_keytab} {_yarn_timelineservice_principal_name};")
-else:
-  rm_kinit_cmd = ""
-  yarn_timelineservice_kinit_cmd = ""
+
+  if 'yarn.nodemanager.principal' in config['configurations']['yarn-site']:
+    _nodemanager_principal_name = config['configurations']['yarn-site']['yarn.nodemanager.principal'].replace('_HOST', hostname.lower())  # expand _HOST, as done for the RM principal
+    _nodemanager_keytab = config['configurations']['yarn-site']['yarn.nodemanager.keytab']
+    nodemanager_kinit_cmd = format("{kinit_path_local} -kt {_nodemanager_keytab} {_nodemanager_principal_name};")
+
 
 yarn_log_aggregation_enabled = config['configurations']['yarn-site']['yarn.log-aggregation-enable']
 yarn_nm_app_log_dir =  config['configurations']['yarn-site']['yarn.nodemanager.remote-app-log-dir']
@@ -155,7 +162,6 @@ mapreduce_jobhistory_done_dir = config['configurations']['mapred-site']['mapredu
 jobhistory_heapsize = default("/configurations/mapred-env/jobhistory_heapsize", "900")
 
 #for create_hdfs_directory
-hostname = config["hostname"]
 hdfs_user_keytab = config['configurations']['hadoop-env']['hdfs_user_keytab']
 hdfs_principal_name = config['configurations']['hadoop-env']['hdfs_principal_name']
 import functools

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.2.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.2.xml b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.2.xml
index 4e6860c..e9546ee 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.2.xml
+++ b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.2.xml
@@ -97,7 +97,6 @@
   
   
   <processing>
-  
     <service name="ZOOKEEPER">
       <component name="ZOOKEEPER_SERVER">
         <!-- TODO, optimization
@@ -130,7 +129,7 @@
 
     <service name="HDFS">
       <component name="NAMENODE">
-        
+
         <pre-upgrade>
           <!-- Backup the image,
           Enter Safemode if not already in it,
@@ -183,10 +182,10 @@
             <every>1</every>
           </task>
         </pre-upgrade>
-        
+
         <upgrade>
           <task xsi:type="restart" />
-        </upgrade>        
+        </upgrade>
 
         <!-- This step should be done once the user clicks on the "Finalize" button. So the name post-upgrade is misleading. -->
         <post-upgrade>
@@ -200,42 +199,6 @@
       </component>
 
       <component name="DATANODE">
-        <pre-upgrade>
-          <!-- Shutdown the datanode,
-
-          Will retry 50 times.
-          Property dfs.datanode.ipc.address = 0.0.0.0:8010 needs to evaluate to current host.
-          $ su hdfs -c 'hdfs dfsadmin -shutdownDatanode <DATANODE_HOST:IPC_PORT> upgrade'
-          E.g.,
-          $ su hdfs -c 'hdfs dfsadmin -shutdownDatanode c6407.ambari.apache.org:8010 upgrade'
-
-          Will retry 50 times.
-          $ su hdfs -c 'hdfs dfsadmin -getDatanodeInfo c6407.ambari.apache.org:8010'
-          Datanode unreachable.
-
-          Change the version,
-          $ hdp-select set hadoop-hdfs-datanode 2.2.0.1-885
-
-          Start the datanode,
-          $ su - hdfs -c '/usr/hdp/current/hadoop-hdfs-datanode/../hadoop/sbin/hadoop-daemon.sh start datanode'
-          starting datanode, logging to /var/log/hadoop/hdfs/hadoop-hdfs-datanode-c6407.ambari.apache.org.out
-
-          Verify it is live,
-          $ su - hdfs -c 'hdfs dfsadmin -report -live'
-          Live datanodes (1):
-          -->
-          <task xsi:type="execute">
-            <first>su {{hadoop-env/hdfs_user}} -c 'hdfs dfsadmin -getDatanodeInfo {{hdfs-site/dfs.datanode.ipc.address}}'</first>
-            <command>su {{hadoop-env/hdfs_user}} -c 'hdfs dfsadmin -shutdownDatanode {{hdfs-site/dfs.datanode.ipc.address}} upgrade'</command>
-          </task>
-
-          <!-- After shutting down the datanode, this command is expected to fail with 255, so ignore only that return code. -->
-          <task xsi:type="execute">
-            <command>su {{hadoop-env/hdfs_user}} -c 'hdfs dfsadmin -getDatanodeInfo {{hdfs-site/dfs.datanode.ipc.address}}'</command>
-            <ignore>255</ignore>
-          </task>
-        </pre-upgrade>
-
         <upgrade>
           <task xsi:type="restart" />
         </upgrade>

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
index e661f2c..3bbf1e5 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
@@ -18,9 +18,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 from stacks.utils.RMFTestCase import *
-from ambari_commons import OSCheck
 import json
 from mock.mock import MagicMock, patch
+from resource_management.core.exceptions import Fail
 
 class TestDatanode(RMFTestCase):
 
@@ -416,3 +416,75 @@ class TestDatanode(RMFTestCase):
                               recursive = True,
                               recursive_permission = True
                               )
+
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart(self, process_mock, time_mock):
+    process_output = """
+      Live datanodes (2):
+
+      Name: 192.168.64.102:50010 (c6401.ambari.apache.org)
+      Hostname: c6401.ambari.apache.org
+      Decommission Status : Normal
+      Configured Capacity: 524208947200 (488.21 GB)
+      DFS Used: 193069056 (184.13 MB)
+      Non DFS Used: 29264986112 (27.26 GB)
+      DFS Remaining: 494750892032 (460.77 GB)
+      DFS Used%: 0.04%
+      DFS Remaining%: 94.38%
+      Configured Cache Capacity: 0 (0 B)
+      Cache Used: 0 (0 B)
+      Cache Remaining: 0 (0 B)
+      Cache Used%: 100.00%
+      Cache Remaining%: 0.00%
+      Xceivers: 2
+      Last contact: Fri Dec 12 20:47:21 UTC 2014
+    """
+
+    process = MagicMock()
+    process.communicate.return_value = [process_output]
+    process.returncode = 0
+    process_mock.return_value = process
+
+    self.executeScript("2.0.6/services/HDFS/package/scripts/datanode.py",
+      classname = "DataNode", command = "post_rolling_restart", config_file="default.json",  )
+
+    self.assertTrue(process_mock.called)
+    self.assertEqual(process_mock.call_count,1)
+
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart_datanode_not_ready(self, process_mock, time_mock):
+    process = MagicMock()
+    process.communicate.return_value = ['There are no DataNodes here!']
+    process.returncode = 0
+    process_mock.return_value = process
+
+    try:
+      self.executeScript("2.0.6/services/HDFS/package/scripts/datanode.py",
+        classname = "DataNode", command = "post_rolling_restart", config_file="default.json",  )
+      self.fail('Missing DataNode should have caused a failure')
+    except Fail,fail:
+      self.assertTrue(process_mock.called)
+      self.assertEqual(process_mock.call_count,12)
+
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart_bad_returncode(self, process_mock, time_mock):
+    process = MagicMock()
+    process.communicate.return_value = ['some']
+    process.returncode = 999
+    process_mock.return_value = process
+
+    try:
+      self.executeScript("2.0.6/services/HDFS/package/scripts/datanode.py",
+        classname = "DataNode", command = "post_rolling_restart", config_file="default.json",  )
+      self.fail('Invalid return code should cause a failure')
+    except Fail,fail:
+      self.assertTrue(process_mock.called)
+      self.assertEqual(process_mock.call_count,12)
+
+

http://git-wip-us.apache.org/repos/asf/ambari/blob/5b0ccc01/ambari-server/src/test/python/stacks/2.0.6/YARN/test_nodemanager.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/YARN/test_nodemanager.py b/ambari-server/src/test/python/stacks/2.0.6/YARN/test_nodemanager.py
index 44a9bad..d4229dc 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/YARN/test_nodemanager.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/YARN/test_nodemanager.py
@@ -19,6 +19,7 @@ limitations under the License.
 '''
 from mock.mock import MagicMock, call, patch
 from stacks.utils.RMFTestCase import *
+from resource_management.core.exceptions import Fail
 import os
 
 origin_exists = os.path.exists
@@ -556,3 +557,63 @@ class TestNodeManager(RMFTestCase):
                               owner = 'mapred',
                               group = 'hadoop',
                               )
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart(self, process_mock, time_mock):
+    process_output = """
+      c6401.ambari.apache.org:45454  RUNNING  c6401.ambari.apache.org:8042  0
+    """
+
+    process = MagicMock()
+    process.communicate.return_value = [process_output]
+    process.returncode = 0
+    process_mock.return_value = process
+
+    self.executeScript("2.0.6/services/YARN/package/scripts/nodemanager.py",
+      classname="Nodemanager", command = "post_rolling_restart", config_file="default.json")
+
+    self.assertTrue(process_mock.called)
+    self.assertEqual(process_mock.call_count,1)
+
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart_nodemanager_not_ready(self, process_mock, time_mock):
+    process_output = """
+      c9999.ambari.apache.org:45454  RUNNING  c9999.ambari.apache.org:8042  0
+    """
+
+    process = MagicMock()
+    process.communicate.return_value = [process_output]
+    process.returncode = 0
+    process_mock.return_value = process
+
+    try:
+      self.executeScript("2.0.6/services/YARN/package/scripts/nodemanager.py",
+        classname="Nodemanager", command = "post_rolling_restart", config_file="default.json")
+      self.fail('Missing NodeManager should have caused a failure')
+    except Fail,fail:
+      self.assertTrue(process_mock.called)
+      self.assertEqual(process_mock.call_count,12)
+
+
+  @patch('time.sleep')
+  @patch("subprocess.Popen")
+  def test_post_rolling_restart_bad_returncode(self, process_mock, time_mock):
+    process_output = """
+      c6401.ambari.apache.org:45454  RUNNING  c6401.ambari.apache.org:8042  0
+    """
+    # the node is listed as RUNNING, but a non-zero return code must still fail the check
+    process = MagicMock()
+    process.communicate.return_value = [process_output]
+    process.returncode = 999
+    process_mock.return_value = process
+
+    try:
+      self.executeScript("2.0.6/services/YARN/package/scripts/nodemanager.py",
+        classname="Nodemanager", command = "post_rolling_restart", config_file="default.json")
+      self.fail('Invalid return code should cause a failure')
+    except Fail,fail:
+      self.assertTrue(process_mock.called)
+      self.assertEqual(process_mock.call_count,12)