You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by ma...@apache.org on 2014/10/17 21:54:20 UTC

git commit: Adding wait loop into host_drain status monitoring.

Repository: incubator-aurora
Updated Branches:
  refs/heads/master 9ef14e78d -> b5d66b3e2


Adding wait loop into host_drain status monitoring.

Bugs closed: AURORA-820

Reviewed at https://reviews.apache.org/r/26458/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/b5d66b3e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/b5d66b3e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/b5d66b3e

Branch: refs/heads/master
Commit: b5d66b3e25135ccf74a92f1cf53df8bf126dce94
Parents: 9ef14e7
Author: Maxim Khutornenko <ma...@apache.org>
Authored: Fri Oct 17 12:53:36 2014 -0700
Committer: Maxim Khutornenko <ma...@apache.org>
Committed: Fri Oct 17 12:53:36 2014 -0700

----------------------------------------------------------------------
 .../apache/aurora/admin/host_maintenance.py     | 51 ++++++++----
 .../aurora/admin/test_host_maintenance.py       | 86 +++++++++++++++-----
 .../aurora/client/commands/test_maintenance.py  | 12 ++-
 3 files changed, 111 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/main/python/apache/aurora/admin/host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/host_maintenance.py b/src/main/python/apache/aurora/admin/host_maintenance.py
index 9c2a9f7..697de4b 100644
--- a/src/main/python/apache/aurora/admin/host_maintenance.py
+++ b/src/main/python/apache/aurora/admin/host_maintenance.py
@@ -11,9 +11,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from threading import Event
 
 from twitter.common import log
-from twitter.common.quantity import Time
+from twitter.common.quantity import Amount, Time
 
 from apache.aurora.admin.admin_util import format_sla_results, print_results
 from apache.aurora.client.api import AuroraClientAPI
@@ -36,6 +37,8 @@ class HostMaintenance(object):
   """
 
   SLA_MIN_JOB_INSTANCE_COUNT = 20
+  STATUS_POLL_INTERVAL = Amount(5, Time.SECONDS)
+  MAX_STATUS_WAIT = Amount(5, Time.MINUTES)
 
   @classmethod
   def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
@@ -44,8 +47,21 @@ class HostMaintenance(object):
     for group in groups:
       yield Hosts(group[1])
 
-  def __init__(self, cluster, verbosity):
+  def __init__(self, cluster, verbosity, wait_event=None):
     self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
+    self._wait_event = wait_event or Event()
+
+  def check_if_drained(self, hostnames):
+    """Checks if host names reached DRAINED status.
+
+    :param hostnames: Host names to check for DRAINED status
+    :type hostnames: list of strings
+    :rtype: set of host names not in DRAINED state
+    """
+    statuses = self.check_status(hostnames)
+    not_ready_hostnames = [h[0] for h in statuses if h[1] != 'DRAINED']
+    log.info('Waiting for hosts to be in DRAINED: %s' % not_ready_hostnames)
+    return set(not_ready_hostnames)
 
   def _drain_hosts(self, drainable_hosts):
     """"Drains tasks from the specified hosts.
@@ -55,19 +71,24 @@ class HostMaintenance(object):
 
     :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
     :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
+    :rtype: set of host names that failed to drain
     """
     check_and_log_response(self._client.drain_hosts(drainable_hosts))
-    not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
-    while not_ready_hostnames:
-      resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
-      if not resp.result.maintenanceStatusResult.statuses:
-        not_ready_hostnames = None
-      for host_status in resp.result.maintenanceStatusResult.statuses:
-        if host_status.mode != MaintenanceMode.DRAINED:
-          log.warning('%s is currently in status %s' %
-              (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
-        else:
-          not_ready_hostnames.remove(host_status.host)
+    drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]
+
+    total_wait = self.STATUS_POLL_INTERVAL
+    not_drained_hostnames = set(drainable_hostnames)
+    while not self._wait_event.is_set() and not_drained_hostnames:
+      self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))
+
+      not_drained_hostnames = self.check_if_drained(drainable_hostnames)
+
+      total_wait += self.STATUS_POLL_INTERVAL
+      if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
+        log.warning('Failed to move all hosts into DRAINED within %s' % self.MAX_STATUS_WAIT)
+        break
+
+    return not_drained_hostnames
 
   def _complete_maintenance(self, drained_hosts):
     """End the maintenance status for a given set of hosts.
@@ -186,11 +207,11 @@ class HostMaintenance(object):
       else:
         log.info('All hosts passed SLA check.')
 
-      self._drain_hosts(hosts)
+      not_drained_hostnames |= self._drain_hosts(hosts)
 
     if not_drained_hostnames:
       output = '\n'.join(list(not_drained_hostnames))
-      log.info('The following hosts did not pass SLA check and were not drained:')
+      log.info('The following hosts WERE NOT DRAINED due to failed SLA check or external failures:')
       print(output)
       if output_file:
         try:

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/test/python/apache/aurora/admin/test_host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/test_host_maintenance.py b/src/test/python/apache/aurora/admin/test_host_maintenance.py
index 40228df..246081a 100644
--- a/src/test/python/apache/aurora/admin/test_host_maintenance.py
+++ b/src/test/python/apache/aurora/admin/test_host_maintenance.py
@@ -19,6 +19,7 @@ from contextlib import contextmanager
 import mock
 from twitter.common import log
 from twitter.common.contextutil import temporary_file
+from twitter.common.quantity import Amount, Time
 
 from apache.aurora.admin.host_maintenance import HostMaintenance
 from apache.aurora.client.api import AuroraClientAPI
@@ -51,27 +52,38 @@ class TestHostMaintenance(unittest.TestCase):
       spec=AuroraClientAPI.maintenance_status)
   @mock.patch("apache.aurora.client.api.AuroraClientAPI.drain_hosts",
       spec=AuroraClientAPI.drain_hosts)
-  def test_drain_hosts(self, mock_drain_hosts, mock_maintenance_status):
+  @mock.patch("threading._Event.wait")
+  def test_drain_hosts(self, mock_event_wait, mock_drain_hosts, mock_maintenance_status):
     fake_maintenance_status_response = [
-        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
-            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
-            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
-            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
-        ])))),
-        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
-            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
-            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINING),
-            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINING)
-        ])))),
-        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
-            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
-            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
-            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
-        ])))),
-        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
-            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINED)
-        ]))))
-    ]
+        Response(
+            responseCode=ResponseCode.OK,
+            result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+                HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
+                HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
+                HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
+            ])))),
+        Response(
+            responseCode=ResponseCode.OK,
+            result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+                HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+                HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINING),
+                HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINING)
+            ])))),
+        Response(
+            responseCode=ResponseCode.OK,
+            result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+                HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+                HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+                HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+            ])))),
+        Response(
+            responseCode=ResponseCode.OK,
+            result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+                HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINED),
+                HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+                HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+            ]))))]
+
     fake_maintenance_status_call_args = []
     def fake_maintenance_status_side_effect(hosts):
       fake_maintenance_status_call_args.append(copy.deepcopy(hosts))
@@ -81,14 +93,43 @@ class TestHostMaintenance(unittest.TestCase):
     mock_maintenance_status.side_effect = fake_maintenance_status_side_effect
     test_hosts = Hosts(set(TEST_HOSTNAMES))
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
-    maintenance._drain_hosts(test_hosts)
+
+    not_drained_hostnames = maintenance._drain_hosts(test_hosts)
+    assert len(not_drained_hostnames) == 0
     mock_drain_hosts.assert_called_once_with(test_hosts)
     assert mock_maintenance_status.call_count == 4
+    assert mock_event_wait.call_count == 4
     assert fake_maintenance_status_call_args == [
         (Hosts(set(TEST_HOSTNAMES))),
         (Hosts(set(TEST_HOSTNAMES))),
         (Hosts(set(TEST_HOSTNAMES))),
-        (Hosts(set([TEST_HOSTNAMES[0]])))]
+        (Hosts(set(TEST_HOSTNAMES)))]
+
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
+              spec=AuroraClientAPI.maintenance_status)
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.drain_hosts",
+              spec=AuroraClientAPI.drain_hosts)
+  @mock.patch("threading._Event.wait")
+  def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts, mock_maintenance_status):
+    fake_maintenance_status_response = Response(
+        responseCode=ResponseCode.OK,
+        result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+          HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
+          HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
+          HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
+        ]))))
+
+    mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
+    mock_maintenance_status.return_value = fake_maintenance_status_response
+    test_hosts = Hosts(set(TEST_HOSTNAMES))
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)
+
+    not_drained_hostnames = maintenance._drain_hosts(test_hosts)
+    assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
+    assert mock_maintenance_status.call_count == 1
+    mock_drain_hosts.assert_called_once_with(test_hosts)
+    mock_maintenance_status.assert_called_once_with((Hosts(set(TEST_HOSTNAMES))))
 
   @mock.patch("twitter.common.log.warning", spec=log.warning)
   @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
@@ -156,6 +197,7 @@ class TestHostMaintenance(unittest.TestCase):
     mock_check_sla.return_value = set([failed_host])
     mock_start_maintenance.return_value = TEST_HOSTNAMES
     drained_hosts = set(TEST_HOSTNAMES) - set([failed_host])
+    mock_drain_hosts.return_value = set()
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
 
     with temporary_file() as fp:

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/test/python/apache/aurora/client/commands/test_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_maintenance.py b/src/test/python/apache/aurora/client/commands/test_maintenance.py
index d86aaf6..13d753f 100644
--- a/src/test/python/apache/aurora/client/commands/test_maintenance.py
+++ b/src/test/python/apache/aurora/client/commands/test_maintenance.py
@@ -17,6 +17,7 @@ import contextlib
 from mock import Mock, patch
 from twitter.common.contextutil import temporary_file
 
+from apache.aurora.admin.host_maintenance import HostMaintenance
 from apache.aurora.client.commands.maintenance import (
     host_activate,
     host_deactivate,
@@ -139,6 +140,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
               return_value=mock_vector),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('apache.aurora.client.commands.maintenance.parse_script', return_value=mock_callback),
+        patch('threading._Event.wait'),
         patch('twitter.common.app.get_options', return_value=mock_options)):
       host_drain([self.TEST_CLUSTER])
 
@@ -173,6 +175,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
               return_value=mock_vector),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+        patch('threading._Event.wait'),
         patch('twitter.common.app.get_options', return_value=mock_options)):
       host_drain([self.TEST_CLUSTER])
 
@@ -201,6 +204,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
           patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
                 return_value=mock_vector),
           patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+          patch('threading._Event.wait'),
           patch('twitter.common.app.get_options', return_value=mock_options)):
         host_drain([self.TEST_CLUSTER])
 
@@ -224,6 +228,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
           self.create_probe_hosts(self.HOSTNAMES[1], 95, False, None),
           self.create_probe_hosts(self.HOSTNAMES[2], 95, False, None)
       ])
+      mock_wait = Mock()
 
       with contextlib.nested(
           patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
@@ -231,11 +236,15 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
                 return_value=mock_vector),
           patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
           patch('apache.aurora.admin.admin_util.log_admin_message'),
-          patch('twitter.common.app.get_options', return_value=mock_options)) as (_, _, _, log, _):
+          patch('threading._Event.wait', return_value=mock_wait),
+          patch('twitter.common.app.get_options', return_value=mock_options)
+      ) as (_, _, _, log, _, _):
+
         host_drain([self.TEST_CLUSTER])
 
         assert 'Test overrides' in log.call_args[0][1]
         mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
+        mock_wait.called_once_with(HostMaintenance.MAX_STATUS_WAIT)
 
   def test_perform_maintenance_hosts_no_prod_tasks(self):
     mock_options = self.make_mock_options()
@@ -262,6 +271,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
         patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
               return_value=create_empty_sla_results()),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+        patch('threading._Event.wait'),
         patch('twitter.common.app.get_options', return_value=mock_options)):
 
       host_drain([self.TEST_CLUSTER])