You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by ma...@apache.org on 2014/08/23 01:49:00 UTC

git commit: Refactoring maintenance commands to host_ prefix and converting perform_maintenance_hosts into host_drain.

Repository: incubator-aurora
Updated Branches:
  refs/heads/master 900a2b299 -> 3f8b5ea73


Refactoring maintenance commands to host_ prefix
and converting perform_maintenance_hosts into host_drain.

Bugs closed: AURORA-43

Reviewed at https://reviews.apache.org/r/24871/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/3f8b5ea7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/3f8b5ea7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/3f8b5ea7

Branch: refs/heads/master
Commit: 3f8b5ea73340a33b5d3c285ae58d648aeecf2222
Parents: 900a2b2
Author: Maxim Khutornenko <ma...@apache.org>
Authored: Fri Aug 22 16:48:37 2014 -0700
Committer: Maxim Khutornenko <ma...@apache.org>
Committed: Fri Aug 22 16:48:37 2014 -0700

----------------------------------------------------------------------
 .../python/apache/aurora/admin/admin_util.py    |  94 +++++++++++--
 .../apache/aurora/admin/host_maintenance.py     |  41 ++----
 .../aurora/client/commands/maintenance.py       | 135 +++++++++----------
 src/test/python/apache/aurora/admin/BUILD       |   1 +
 .../aurora/admin/test_host_maintenance.py       |  41 +-----
 .../aurora/client/commands/test_maintenance.py  | 107 ++++-----------
 6 files changed, 198 insertions(+), 221 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/main/python/apache/aurora/admin/admin_util.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/admin_util.py b/src/main/python/apache/aurora/admin/admin_util.py
index 31ec2b1..394deb5 100644
--- a/src/main/python/apache/aurora/admin/admin_util.py
+++ b/src/main/python/apache/aurora/admin/admin_util.py
@@ -19,6 +19,9 @@ import os
 import subprocess
 from uuid import uuid1
 
+from twitter.common.quantity import Amount, Time
+from twitter.common.quantity.parse_simple import parse_time
+
 from apache.aurora.client.base import die
 
 """Admin client utility functions shared between admin and maintenance modules."""
@@ -28,6 +31,10 @@ LOGGER_NAME = 'aurora_admin'
 logger = logging.getLogger(LOGGER_NAME)
 CLIENT_ID = uuid1()
 
+# Default SLA limits
+SLA_UPTIME_PERCENTAGE_LIMIT = 95
+SLA_UPTIME_DURATION_LIMIT = Amount(30, Time.MINUTES)
+
 
 def log_admin_message(sev, msg, *args, **kwargs):
   """Logs message using the module-defined logger.
@@ -46,17 +53,55 @@ def log_admin_message(sev, msg, *args, **kwargs):
 
 
 FILENAME_OPTION = optparse.Option(
-  '--filename',
-  dest='filename',
-  default=None,
-  help='Name of the file with hostnames')
+    '--filename',
+    dest='filename',
+    default=None,
+    help='Name of the file with hostnames')
 
 
 HOSTS_OPTION = optparse.Option(
-  '--hosts',
-  dest='hosts',
-  default=None,
-  help='Comma separated list of hosts')
+    '--hosts',
+    dest='hosts',
+    default=None,
+    help='Comma separated list of hosts')
+
+
+POST_DRAIN_SCRIPT_OPTION = optparse.Option(
+    '--post_drain_script',
+    dest='post_drain_script',
+    default=None,
+    help='Path to a script to run for each host if needed.')
+
+OVERRIDE_SLA_PERCENTAGE_OPTION = optparse.Option(
+    '--override_percentage',
+    dest='percentage',
+    default=None,
+    help='Percentage of tasks required to be up all the time within the duration. '
+         'Default value:%s. DO NOT override default value unless absolutely necessary! '
+         'See sla_probe_hosts and sla_list_safe_domain commands '
+         'for more details on SLA.' % SLA_UPTIME_PERCENTAGE_LIMIT)
+
+OVERRIDE_SLA_DURATION_OPTION = optparse.Option(
+    '--override_duration',
+    dest='duration',
+    default=None,
+    help='Time interval (now - value) for the percentage of up tasks. Format: XdYhZmWs. '
+         'Default value:%s. DO NOT override default value unless absolutely necessary! '
+         'See sla_probe_hosts and sla_list_safe_domain commands '
+         'for more details on SLA.' % SLA_UPTIME_DURATION_LIMIT)
+
+OVERRIDE_SLA_REASON_OPTION = optparse.Option(
+    '--override_reason',
+    dest='reason',
+    default=None,
+    help='Reason for overriding default SLA values. Provide details including the '
+         'maintenance ticket number.')
+
+UNSAFE_SLA_HOSTS_FILE_OPTION = optparse.Option(
+    '--unsafe_hosts_file',
+    dest='unsafe_hosts_filename',
+    default=None,
+    help='Output file to write host names that did not pass SLA check.')
 
 
 def parse_sla_percentage(percentage):
@@ -149,6 +194,39 @@ def parse_script(filename):
     return None
 
 
+def parse_and_validate_sla_overrides(options, hostnames):
+  """Parses and validates host SLA override 3-tuple (percentage, duration, reason).
+
+  In addition, logs an admin message about overriding default SLA values.
+
+  :param options: command line options
+  :type options: list of app.option
+  :param hostnames: host names override is issued to
+  :type hostnames: list of string
+  :rtype: a tuple of: override percentage (float) and override duration (Amount)
+  """
+  has_override = bool(options.percentage) or bool(options.duration) or bool(options.reason)
+  all_overrides = bool(options.percentage) and bool(options.duration) and bool(options.reason)
+  if has_override != all_overrides:
+    die('All --override_* options are required when attempting to override default SLA values.')
+
+  percentage = parse_sla_percentage(options.percentage) if options.percentage else None
+  duration = parse_time(options.duration) if options.duration else None
+  if options.reason:
+    log_admin_message(
+      logging.WARNING,
+      'Default SLA values (percentage: %s, duration: %s) are overridden for the following '
+      'hosts: %s. New percentage: %s, duration: %s, override reason: %s' % (
+        SLA_UPTIME_PERCENTAGE_LIMIT,
+        SLA_UPTIME_DURATION_LIMIT,
+        hostnames,
+        percentage,
+        duration,
+        options.reason))
+
+  return percentage or SLA_UPTIME_PERCENTAGE_LIMIT, duration or SLA_UPTIME_DURATION_LIMIT
+
+
 def print_results(results):
   """Prints formatted SLA results.
 

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/main/python/apache/aurora/admin/host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/host_maintenance.py b/src/main/python/apache/aurora/admin/host_maintenance.py
index 162f9af..29745a3 100644
--- a/src/main/python/apache/aurora/admin/host_maintenance.py
+++ b/src/main/python/apache/aurora/admin/host_maintenance.py
@@ -40,8 +40,6 @@ class HostMaintenance(object):
   START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)
 
   SLA_MIN_JOB_INSTANCE_COUNT = 20
-  SLA_UPTIME_PERCENTAGE_LIMIT = 95
-  SLA_UPTIME_DURATION_LIMIT = Amount(30, Time.MINUTES)
 
 
   @classmethod
@@ -92,18 +90,7 @@ class HostMaintenance(object):
       if host_status.mode != MaintenanceMode.NONE:
         log.warning('%s is DRAINING or in DRAINED' % host_status.host)
 
-  def _operate_on_hosts(self, drained_hosts, callback):
-    """Perform a given operation on a list of hosts that are ready for maintenance.
-
-    :param drained_hosts: Hosts that have been drained (via _drain_hosts)
-    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
-    :param callback: Function to call one hostname at a time
-    :type callback: function
-    """
-    for hostname in drained_hosts.hostNames:
-      callback(hostname)
-
-  def _check_sla(self, hostnames, grouping_function, percentage=None, duration=None):
+  def _check_sla(self, hostnames, grouping_function, percentage, duration):
     """Check if the provided list of hosts passes the job uptime SLA check.
 
     This is an all-or-nothing check, meaning that all provided hosts must pass their job
@@ -119,13 +106,10 @@ class HostMaintenance(object):
     :type duration: twitter.common.quantity.Amount
     :rtype: set of unsafe hosts
     """
-    sla_percentage = percentage or self.SLA_UPTIME_PERCENTAGE_LIMIT
-    sla_duration = duration or self.SLA_UPTIME_DURATION_LIMIT
-
     vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
     host_groups = vector.probe_hosts(
-      sla_percentage,
-      sla_duration.as_(Time.SECONDS),
+      percentage,
+      duration.as_(Time.SECONDS),
       grouping_function)
 
     unsafe_hostnames = set()
@@ -170,25 +154,24 @@ class HostMaintenance(object):
     return result
 
   def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
-                          callback=None, percentage=None, duration=None, output_file=None):
-    """Wrap a callback in between sending hosts into maintenance mode and back.
+                          percentage=None, duration=None, output_file=None):
+    """Put hosts into maintenance mode and drain them.
+
+    Walk through the process of putting hosts into maintenance and draining them of tasks. The hosts
+    will remain in maintenance mode upon completion.
 
-    Walk through the process of putting hosts into maintenance, draining them of tasks,
-    performing an action on them once drained, then removing them from maintenance mode
-    so tasks can schedule.
 
     :param hostnames: A list of hostnames to operate upon
     :type hostnames: list of strings
     :param grouping_function: How to split up the hostname into groups
     :type grouping_function: function
-    :param callback: Function to call once hosts are drained
-    :type callback: function
     :param percentage: SLA percentage to use
     :type percentage: float
     :param duration: SLA duration to use
     :type duration: twitter.common.quantity.Time
     :param output_file: file to write hosts that were not drained due to failed SLA check
     :type output_file: string
+    :rtype: set of host names that were successfully drained
     """
     hostnames = self.start_maintenance(hostnames)
     not_drained_hostnames = set()
@@ -204,7 +187,6 @@ class HostMaintenance(object):
       if unsafe_hostnames:
         log.warning('Some hosts did not pass SLA check and will not be drained! '
                     'Skipping hosts: %s' % unsafe_hostnames)
-        self._complete_maintenance(Hosts(unsafe_hostnames))
         not_drained_hostnames |= unsafe_hostnames
         drainable_hostnames = hosts.hostNames - unsafe_hostnames
         if not drainable_hostnames:
@@ -214,9 +196,6 @@ class HostMaintenance(object):
         log.info('All hosts passed SLA check.')
 
       self._drain_hosts(hosts)
-      if callback:
-        self._operate_on_hosts(hosts, callback)
-      self._complete_maintenance(hosts)
 
     if not_drained_hostnames:
       output = '\n'.join(list(not_drained_hostnames))
@@ -231,6 +210,8 @@ class HostMaintenance(object):
         except IOError as e:
           log.error('Failed to write into the output file: %s' % e)
 
+    return set(hostnames) - not_drained_hostnames
+
   def check_status(self, hostnames):
     """Query the scheduler to determine the maintenance status for a list of hostnames
 

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/main/python/apache/aurora/client/commands/maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/commands/maintenance.py b/src/main/python/apache/aurora/client/commands/maintenance.py
index a54f0f6..c83b96a 100644
--- a/src/main/python/apache/aurora/client/commands/maintenance.py
+++ b/src/main/python/apache/aurora/client/commands/maintenance.py
@@ -15,15 +15,18 @@
 import logging
 
 from twitter.common import app, log
-from twitter.common.quantity.parse_simple import parse_time
 
 from apache.aurora.admin.admin_util import (
     FILENAME_OPTION,
     HOSTS_OPTION,
-    log_admin_message,
+    OVERRIDE_SLA_DURATION_OPTION,
+    OVERRIDE_SLA_PERCENTAGE_OPTION,
+    OVERRIDE_SLA_REASON_OPTION,
+    parse_and_validate_sla_overrides,
     parse_hostnames,
     parse_script,
-    parse_sla_percentage
+    POST_DRAIN_SCRIPT_OPTION,
+    UNSAFE_SLA_HOSTS_FILE_OPTION
 )
 from apache.aurora.admin.host_maintenance import HostMaintenance
 from apache.aurora.client.base import die, get_grouping_or_die, GROUPING_OPTION, requires
@@ -34,9 +37,18 @@ from apache.aurora.common.clusters import CLUSTERS
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
-def start_maintenance_hosts(cluster):
-  """usage: start_maintenance_hosts {--filename=filename | --hosts=hosts}
-                                    cluster
+def host_deactivate(cluster):
+  """usage: host_deactivate {--filename=filename | --hosts=hosts}
+                            cluster
+
+  Puts hosts into maintenance mode.
+
+  The list of hosts is marked for maintenance, and will be de-prioritized
+  from consideration for scheduling.  Note, they are not removed from
+  consideration, and may still schedule tasks if resources are very scarce.
+  Usually you would mark a larger set of machines for drain, and then do
+  them in batches within the larger set, to help drained tasks not land on
+  future hosts that will be drained shortly in subsequent batches.
   """
   options = app.get_options()
   HostMaintenance(CLUSTERS[cluster], options.verbosity).start_maintenance(
@@ -47,9 +59,14 @@ def start_maintenance_hosts(cluster):
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
-def end_maintenance_hosts(cluster):
-  """usage: end_maintenance_hosts {--filename=filename | --hosts=hosts}
-                                  cluster
+def host_activate(cluster):
+  """usage: host_activate {--filename=filename | --hosts=hosts}
+                          cluster
+
+  Removes maintenance mode from hosts.
+
+  The list of hosts is marked as not in a drained state anymore. This will
+  allow normal scheduling to resume on the given list of hosts.
   """
   options = app.get_options()
   HostMaintenance(CLUSTERS[cluster], options.verbosity).end_maintenance(
@@ -57,84 +74,66 @@ def end_maintenance_hosts(cluster):
 
 
 @app.command
-@app.command_option('--post_drain_script', dest='post_drain_script', default=None,
-    help='Path to a script to run for each host.')
-@app.command_option('--override_percentage', dest='percentage', default=None,
-    help='Percentage of tasks required to be up all the time within the duration. '
-         'Default value:%s. DO NOT override default value unless absolutely necessary! '
-         'See sla_probe_hosts and sla_list_safe_domain commands '
-         'for more details on SLA.' % HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT)
-@app.command_option('--override_duration', dest='duration', default=None,
-    help='Time interval (now - value) for the percentage of up tasks. Format: XdYhZmWs. '
-         'Default value:%s. DO NOT override default value unless absolutely necessary! '
-         'See sla_probe_hosts and sla_list_safe_domain commands '
-         'for more details on SLA.' % HostMaintenance.SLA_UPTIME_DURATION_LIMIT)
-@app.command_option('--override_reason', dest='reason', default=None,
-    help='Reason for overriding default SLA values. Provide details including the '
-         'maintenance ticket number.')
-@app.command_option('--unsafe_hosts_file', dest='unsafe_hosts_filename', default=None,
-    help='Output file to write host names that did not pass SLA check.')
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
+@app.command_option(POST_DRAIN_SCRIPT_OPTION)
 @app.command_option(GROUPING_OPTION)
+@app.command_option(OVERRIDE_SLA_PERCENTAGE_OPTION)
+@app.command_option(OVERRIDE_SLA_DURATION_OPTION)
+@app.command_option(OVERRIDE_SLA_REASON_OPTION)
+@app.command_option(UNSAFE_SLA_HOSTS_FILE_OPTION)
 @requires.exactly('cluster')
-def perform_maintenance_hosts(cluster):
-  """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
-                                      [--post_drain_script=path]
-                                      [--grouping=function]
-                                      [--override_percentage=percentage]
-                                      [--override_duration=duration]
-                                      [--override_reason=reason]
-                                      [--unsafe_hosts_file=unsafe_hosts_filename]
-                                      cluster
-
-  Asks the scheduler to remove any running tasks from the machine and remove it
-  from service temporarily, perform some action on them, then return the machines
-  to service.
+def host_drain(cluster):
+  """usage: host_drain {--filename=filename | --hosts=hosts}
+                       [--post_drain_script=path]
+                       [--grouping=function]
+                       [--override_percentage=percentage]
+                       [--override_duration=duration]
+                       [--override_reason=reason]
+                       [--unsafe_hosts_file=unsafe_hosts_filename]
+                       cluster
+
+  Asks the scheduler to start maintenance on the list of provided hosts (see host_deactivate
+  for more details) and drains any active tasks on them.
+
+  The list of hosts is drained and marked in a drained state.  This will kill
+  off any tasks currently running on these hosts, as well as prevent future
+  tasks from scheduling on these hosts while they are drained.
+
+  The hosts are left in maintenance mode upon completion. Use host_activate to
+  return hosts back to service and allow scheduling tasks on them.
   """
   options = app.get_options()
   drainable_hosts = parse_hostnames(options.filename, options.hosts)
   get_grouping_or_die(options.grouping)
 
-  has_override = bool(options.percentage) or bool(options.duration) or bool(options.reason)
-  all_overrides = bool(options.percentage) and bool(options.duration) and bool(options.reason)
-  if has_override != all_overrides:
-    die('All --override_* options are required when attempting to override default SLA values.')
-
-  percentage = parse_sla_percentage(options.percentage) if options.percentage else None
-  duration = parse_time(options.duration) if options.duration else None
-  if options.reason:
-    log_admin_message(
-        logging.WARNING,
-        'Default SLA values (percentage: %s, duration: %s) are overridden for the following '
-        'hosts: %s. New percentage: %s, duration: %s, override reason: %s' % (
-            HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT,
-            HostMaintenance.SLA_UPTIME_DURATION_LIMIT,
-            drainable_hosts,
-            percentage,
-            duration,
-            options.reason))
-
-  drained_callback = parse_script(options.post_drain_script)
-
-  HostMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
+  override_percentage, override_duration = parse_and_validate_sla_overrides(
+      options,
+      drainable_hosts)
+
+  post_drain_callback = parse_script(options.post_drain_script)
+
+  drained_hostnames = HostMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
       drainable_hosts,
       grouping_function=options.grouping,
-      callback=drained_callback,
-      percentage=percentage,
-      duration=duration,
+      percentage=override_percentage,
+      duration=override_duration,
       output_file=options.unsafe_hosts_filename)
 
+  if post_drain_callback:
+    for hostname in drained_hostnames:
+      post_drain_callback(hostname)
+
 
 @app.command
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
-def host_maintenance_status(cluster):
-  """usage: host_maintenance_status {--filename=filename | --hosts=hosts}
-                                    cluster
+def host_status(cluster):
+  """usage: host_status {--filename=filename | --hosts=hosts}
+                        cluster
 
-  Check on the schedulers maintenance status for a list of hosts in the cluster.
+  Print the drain status of each supplied host.
   """
   options = app.get_options()
   checkable_hosts = parse_hostnames(options.filename, options.hosts)

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/test/python/apache/aurora/admin/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/BUILD b/src/test/python/apache/aurora/admin/BUILD
index f423e63..5e170d6 100644
--- a/src/test/python/apache/aurora/admin/BUILD
+++ b/src/test/python/apache/aurora/admin/BUILD
@@ -36,6 +36,7 @@ python_tests(name = 'admin_util',
   sources = ['test_admin_util.py'],
   dependencies = [
     pants('3rdparty/python:mock'),
+    pants('3rdparty/python:twitter.common.quantity'),
     pants('3rdparty/python:twitter.common.contextutil'),
     pants('src/main/python/apache/aurora/admin:util'),
   ],

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/test/python/apache/aurora/admin/test_host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/test_host_maintenance.py b/src/test/python/apache/aurora/admin/test_host_maintenance.py
index d8aeffb..f2c5b7f 100644
--- a/src/test/python/apache/aurora/admin/test_host_maintenance.py
+++ b/src/test/python/apache/aurora/admin/test_host_maintenance.py
@@ -117,13 +117,6 @@ class TestHostMaintenance(unittest.TestCase):
     mock_maintenance_status.assert_called_once_with(test_hosts)
     mock_warning.assert_called_once_with('%s is DRAINING or in DRAINED' % TEST_HOSTNAMES[2])
 
-  def test_operate_on_hosts(self):
-    mock_callback = mock.Mock()
-    test_hosts = Hosts(TEST_HOSTNAMES)
-    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
-    maintenance._operate_on_hosts(test_hosts, mock_callback)
-    assert mock_callback.call_count == 3
-
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._complete_maintenance",
     spec=HostMaintenance._complete_maintenance)
   def test_end_maintenance(self, mock_complete_maintenance):
@@ -140,39 +133,23 @@ class TestHostMaintenance(unittest.TestCase):
     maintenance.start_maintenance(TEST_HOSTNAMES)
     mock_api.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
 
-  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._complete_maintenance",
-    spec=HostMaintenance._complete_maintenance)
-  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._operate_on_hosts",
-    spec=HostMaintenance._operate_on_hosts)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._drain_hosts",
     spec=HostMaintenance._drain_hosts)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance.start_maintenance",
     spec=HostMaintenance.start_maintenance)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._check_sla",
     spec=HostMaintenance._check_sla)
-  def test_perform_maintenance(self, mock_check_sla, mock_start_maintenance,
-      mock_drain_hosts, mock_operate_on_hosts, mock_complete_maintenance):
-    mock_callback = mock.Mock()
+  def test_perform_maintenance(self, mock_check_sla, mock_start_maintenance, mock_drain_hosts):
     mock_check_sla.return_value = set()
     mock_start_maintenance.return_value = TEST_HOSTNAMES
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
-    maintenance.perform_maintenance(TEST_HOSTNAMES, callback=mock_callback)
+    maintenance.perform_maintenance(TEST_HOSTNAMES)
     mock_start_maintenance.assert_called_once_with(TEST_HOSTNAMES)
     assert mock_check_sla.call_count == 3
     assert mock_drain_hosts.call_count == 3
     assert mock_drain_hosts.call_args_list == [
         mock.call(Hosts(set([hostname]))) for hostname in TEST_HOSTNAMES]
-    assert mock_operate_on_hosts.call_count == 3
-    assert mock_operate_on_hosts.call_args_list == [
-        mock.call(Hosts(set([hostname])), mock_callback) for hostname in TEST_HOSTNAMES]
-    assert mock_complete_maintenance.call_count == 3
-    assert mock_complete_maintenance.call_args_list == [
-        mock.call(Hosts(set([hostname]))) for hostname in TEST_HOSTNAMES]
 
-  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._complete_maintenance",
-              spec=HostMaintenance._complete_maintenance)
-  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._operate_on_hosts",
-              spec=HostMaintenance._operate_on_hosts)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._drain_hosts",
               spec=HostMaintenance._drain_hosts)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance.start_maintenance",
@@ -180,8 +157,7 @@ class TestHostMaintenance(unittest.TestCase):
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._check_sla",
               spec=HostMaintenance._check_sla)
   def test_perform_maintenance_partial_sla_failure(self, mock_check_sla, mock_start_maintenance,
-                               mock_drain_hosts, mock_operate_on_hosts, mock_complete_maintenance):
-    mock_callback = mock.Mock()
+                               mock_drain_hosts):
     failed_host = 'us-west-001.example.com'
     mock_check_sla.return_value = set([failed_host])
     mock_start_maintenance.return_value = TEST_HOSTNAMES
@@ -190,9 +166,8 @@ class TestHostMaintenance(unittest.TestCase):
 
     with temporary_file() as fp:
       with group_by_rack():
-        maintenance.perform_maintenance(
+        drained = maintenance.perform_maintenance(
             TEST_HOSTNAMES,
-            callback=mock_callback,
             grouping_function='by_rack',
             output_file=fp.name)
 
@@ -201,15 +176,11 @@ class TestHostMaintenance(unittest.TestCase):
           assert failed_host in content
 
         mock_start_maintenance.assert_called_once_with(TEST_HOSTNAMES)
+        assert len(drained) == 2
+        assert failed_host not in drained
         assert mock_check_sla.call_count == 1
         assert mock_drain_hosts.call_count == 1
         assert mock_drain_hosts.call_args_list == [mock.call(Hosts(drained_hosts))]
-        assert mock_operate_on_hosts.call_count == 1
-        assert mock_operate_on_hosts.call_args_list == [
-            mock.call(Hosts(drained_hosts), mock_callback)]
-        assert mock_complete_maintenance.call_count == 2
-        assert mock_complete_maintenance.call_args_list == [
-            mock.call(Hosts(set([failed_host]))), mock.call(Hosts(drained_hosts))]
 
   @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
       spec=AuroraClientAPI.maintenance_status)

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/3f8b5ea7/src/test/python/apache/aurora/client/commands/test_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_maintenance.py b/src/test/python/apache/aurora/client/commands/test_maintenance.py
index a748c01..2151f02 100644
--- a/src/test/python/apache/aurora/client/commands/test_maintenance.py
+++ b/src/test/python/apache/aurora/client/commands/test_maintenance.py
@@ -18,10 +18,10 @@ from mock import Mock, patch
 from twitter.common.contextutil import temporary_file
 
 from apache.aurora.client.commands.maintenance import (
-    end_maintenance_hosts,
-    host_maintenance_status,
-    perform_maintenance_hosts,
-    start_maintenance_hosts
+    host_activate,
+    host_deactivate,
+    host_drain,
+    host_status
 )
 from apache.aurora.client.commands.util import AuroraClientCommandTest
 
@@ -94,7 +94,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)):
-      start_maintenance_hosts([self.TEST_CLUSTER])
+      host_deactivate([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
 
@@ -107,14 +107,14 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)):
-      end_maintenance_hosts([self.TEST_CLUSTER])
+      host_activate([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.endMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
       mock_scheduler_proxy.maintenanceStatus.assert_called_with(Hosts(set(self.HOSTNAMES)))
 
   def test_perform_maintenance_hosts(self):
     mock_options = self.make_mock_options()
-    mock_options.post_drain_script = None
+    mock_options.post_drain_script = 'callback'
     mock_options.grouping = 'by_host'
 
     def host_status_results(hostnames):
@@ -123,7 +123,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       return self.create_maintenance_status_result()
 
     mock_api, mock_scheduler_proxy = self.create_mock_api()
-    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
+    mock_callback = Mock()
     mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
     mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
     mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
@@ -139,20 +139,17 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
               return_value=mock_vector),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+        patch('apache.aurora.client.commands.maintenance.parse_script', return_value=mock_callback),
         patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_sleep,
-            mock_scheduler_proxy_class,
-            mock_vector_class,
-            mock_clusters_maintenancepatch,
-            options):
-      perform_maintenance_hosts([self.TEST_CLUSTER])
+            mock_sleep, _, _, _, _, _):
+      host_drain([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
       #TODO(jsmith): Consider not mocking out sleep and instead refactoring
       assert mock_sleep.call_count == 3
-      assert mock_scheduler_proxy.maintenanceStatus.call_count == 6
+      assert mock_scheduler_proxy.maintenanceStatus.call_count == 3
       assert mock_scheduler_proxy.drainHosts.call_count == 3
-      assert mock_scheduler_proxy.endMaintenance.call_count == 3
+      assert mock_callback.call_count == 3
 
   def test_perform_maintenance_hosts_unknown_hosts_skipped(self):
     mock_options = self.make_mock_options()
@@ -165,7 +162,6 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       return self.create_maintenance_status_result()
 
     mock_api, mock_scheduler_proxy = self.create_mock_api()
-    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
     mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
     mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result(
         skip_hosts=['us-grf-20'])
@@ -183,18 +179,13 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
               return_value=mock_vector),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_sleep,
-            mock_scheduler_proxy_class,
-            mock_vector_class,
-            mock_clusters_maintenancepatch,
-            options):
-      perform_maintenance_hosts([self.TEST_CLUSTER])
+            mock_sleep, _, _, _, _):
+      host_drain([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
       assert mock_sleep.call_count == 2
-      assert mock_scheduler_proxy.maintenanceStatus.call_count == 4
+      assert mock_scheduler_proxy.maintenanceStatus.call_count == 2
       assert mock_scheduler_proxy.drainHosts.call_count == 2
-      assert mock_scheduler_proxy.endMaintenance.call_count == 2
 
   def test_perform_maintenance_hosts_failed_default_sla(self):
     with temporary_file() as fp:
@@ -203,14 +194,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       mock_options.grouping = 'by_host'
       mock_options.unsafe_hosts_filename = fp.name
 
-      def host_status_results(hostnames):
-        if isinstance(hostnames, Hosts):
-          return self.create_drained_status_result(hostnames)
-        return self.create_maintenance_status_result()
-
       mock_api, mock_scheduler_proxy = self.create_mock_api()
-      mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
-      mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
       mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
       mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
       mock_vector = self.create_mock_probe_hosts_vector([
@@ -226,10 +210,9 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
                 return_value=mock_vector),
           patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
           patch('twitter.common.app.get_options', return_value=mock_options)):
-        perform_maintenance_hosts([self.TEST_CLUSTER])
+        host_drain([self.TEST_CLUSTER])
 
         mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
-        assert mock_scheduler_proxy.endMaintenance.call_count == len(self.HOSTNAMES)
 
   def test_perform_maintenance_hosts_failed_custom_sla(self):
     with temporary_file() as fp:
@@ -241,14 +224,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       mock_options.reason = 'Test overrides'
       mock_options.unsafe_hosts_filename = fp.name
 
-      def host_status_results(hostnames):
-        if isinstance(hostnames, Hosts):
-          return self.create_drained_status_result(hostnames)
-        return self.create_maintenance_status_result()
-
       mock_api, mock_scheduler_proxy = self.create_mock_api()
-      mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
-      mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
       mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
       mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
       mock_vector = self.create_mock_probe_hosts_vector([
@@ -263,14 +239,13 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
           patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
                 return_value=mock_vector),
           patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
-          patch('apache.aurora.client.commands.maintenance.log_admin_message'),
+          patch('apache.aurora.admin.admin_util.log_admin_message'),
           patch('twitter.common.app.get_options', return_value=mock_options)) as (
               _, _, _, _, log, _):
-        perform_maintenance_hosts([self.TEST_CLUSTER])
+        host_drain([self.TEST_CLUSTER])
 
         assert 'Test overrides' in log.call_args[0][1]
         mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
-        assert mock_scheduler_proxy.endMaintenance.call_count == len(self.HOSTNAMES)
 
   def test_perform_maintenance_hosts_no_prod_tasks(self):
     mock_options = self.make_mock_options()
@@ -283,7 +258,6 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       return self.create_maintenance_status_result()
 
     mock_api, mock_scheduler_proxy = self.create_mock_api()
-    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
     mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
     mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
     mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
@@ -300,19 +274,14 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
               return_value=create_empty_sla_results()),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_sleep,
-            mock_scheduler_proxy_class,
-            mock_vector_class,
-            mock_clusters_maintenancepatch,
-            options):
+            mock_sleep, _, _, _, _):
 
-      perform_maintenance_hosts([self.TEST_CLUSTER])
+      host_drain([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
       assert mock_sleep.call_count == 3
-      assert mock_scheduler_proxy.maintenanceStatus.call_count == 6
+      assert mock_scheduler_proxy.maintenanceStatus.call_count == 3
       assert mock_scheduler_proxy.drainHosts.call_count == 3
-      assert mock_scheduler_proxy.endMaintenance.call_count == 3
 
   def test_perform_maintenance_hosts_multiple_sla_groups_failure(self):
     mock_options = self.make_mock_options()
@@ -320,16 +289,8 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     mock_options.grouping = 'by_host'
     mock_options.unsafe_hosts_filename = None
 
-    def host_status_results(hostnames):
-      if isinstance(hostnames, Hosts):
-        return self.create_drained_status_result(hostnames)
-      return self.create_maintenance_status_result()
-
     mock_api, mock_scheduler_proxy = self.create_mock_api()
-    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
-    mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
     mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
-    mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
 
     def create_multiple_sla_results():
       mock_vector = Mock()
@@ -337,25 +298,15 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       return mock_vector
 
     with contextlib.nested(
-        patch('time.sleep'),
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
               return_value=create_multiple_sla_results()),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
-        patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_sleep,
-            mock_scheduler_proxy_class,
-            mock_vector_class,
-            mock_clusters_maintenancepatch,
-            options):
+        patch('twitter.common.app.get_options', return_value=mock_options)):
 
-      perform_maintenance_hosts([self.TEST_CLUSTER])
+      host_drain([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
-      assert mock_sleep.call_count == 0
-      assert mock_scheduler_proxy.maintenanceStatus.call_count == 3
-      assert mock_scheduler_proxy.drainHosts.call_count == 0
-      assert mock_scheduler_proxy.endMaintenance.call_count == 3
 
   def test_perform_maintenance_hosts_reason_missing(self):
     mock_options = self.make_mock_options()
@@ -365,12 +316,8 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
 
     with contextlib.nested(
         patch('twitter.common.app.get_options', return_value=mock_options)):
-      try:
-        perform_maintenance_hosts([self.TEST_CLUSTER])
-      except SystemExit:
-        pass
-      else:
-        assert 'Expected error is not raised.'
+      self.assertRaises(SystemExit, host_drain, [self.TEST_CLUSTER])
+
 
   def test_host_maintenance_status(self):
     mock_options = self.make_mock_options()
@@ -380,6 +327,6 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)):
-      host_maintenance_status([self.TEST_CLUSTER])
+      host_status([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.maintenanceStatus.assert_called_with(Hosts(set(self.HOSTNAMES)))