You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by ma...@apache.org on 2014/10/17 21:54:20 UTC
git commit: Adding wait loop into host_drain status monitoring.
Repository: incubator-aurora
Updated Branches:
refs/heads/master 9ef14e78d -> b5d66b3e2
Adding wait loop into host_drain status monitoring.
Bugs closed: AURORA-820
Reviewed at https://reviews.apache.org/r/26458/
Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/b5d66b3e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/b5d66b3e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/b5d66b3e
Branch: refs/heads/master
Commit: b5d66b3e25135ccf74a92f1cf53df8bf126dce94
Parents: 9ef14e7
Author: Maxim Khutornenko <ma...@apache.org>
Authored: Fri Oct 17 12:53:36 2014 -0700
Committer: Maxim Khutornenko <ma...@apache.org>
Committed: Fri Oct 17 12:53:36 2014 -0700
----------------------------------------------------------------------
.../apache/aurora/admin/host_maintenance.py | 51 ++++++++----
.../aurora/admin/test_host_maintenance.py | 86 +++++++++++++++-----
.../aurora/client/commands/test_maintenance.py | 12 ++-
3 files changed, 111 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/main/python/apache/aurora/admin/host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/host_maintenance.py b/src/main/python/apache/aurora/admin/host_maintenance.py
index 9c2a9f7..697de4b 100644
--- a/src/main/python/apache/aurora/admin/host_maintenance.py
+++ b/src/main/python/apache/aurora/admin/host_maintenance.py
@@ -11,9 +11,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+from threading import Event
from twitter.common import log
-from twitter.common.quantity import Time
+from twitter.common.quantity import Amount, Time
from apache.aurora.admin.admin_util import format_sla_results, print_results
from apache.aurora.client.api import AuroraClientAPI
@@ -36,6 +37,8 @@ class HostMaintenance(object):
"""
SLA_MIN_JOB_INSTANCE_COUNT = 20
+ STATUS_POLL_INTERVAL = Amount(5, Time.SECONDS)
+ MAX_STATUS_WAIT = Amount(5, Time.MINUTES)
@classmethod
def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
@@ -44,8 +47,21 @@ class HostMaintenance(object):
for group in groups:
yield Hosts(group[1])
- def __init__(self, cluster, verbosity):
+ def __init__(self, cluster, verbosity, wait_event=None):
self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
+ self._wait_event = wait_event or Event()
+
+ def check_if_drained(self, hostnames):
+ """Checks if host names reached DRAINED status.
+
+ :param hostnames: Host names to check for DRAINED status
+ :type hostnames: list of strings
+ :rtype: set of host names not in DRAINED state
+ """
+ statuses = self.check_status(hostnames)
+ not_ready_hostnames = [h[0] for h in statuses if h[1] != 'DRAINED']
+ log.info('Waiting for hosts to be in DRAINED: %s' % not_ready_hostnames)
+ return set(not_ready_hostnames)
def _drain_hosts(self, drainable_hosts):
""""Drains tasks from the specified hosts.
@@ -55,19 +71,24 @@ class HostMaintenance(object):
:param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
:type drainable_hosts: gen.apache.aurora.ttypes.Hosts
+ :rtype: set of host names that failed to drain
"""
check_and_log_response(self._client.drain_hosts(drainable_hosts))
- not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
- while not_ready_hostnames:
- resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
- if not resp.result.maintenanceStatusResult.statuses:
- not_ready_hostnames = None
- for host_status in resp.result.maintenanceStatusResult.statuses:
- if host_status.mode != MaintenanceMode.DRAINED:
- log.warning('%s is currently in status %s' %
- (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
- else:
- not_ready_hostnames.remove(host_status.host)
+ drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]
+
+ total_wait = self.STATUS_POLL_INTERVAL
+ not_drained_hostnames = set(drainable_hostnames)
+ while not self._wait_event.is_set() and not_drained_hostnames:
+ self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))
+
+ not_drained_hostnames = self.check_if_drained(drainable_hostnames)
+
+ total_wait += self.STATUS_POLL_INTERVAL
+ if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
+ log.warning('Failed to move all hosts into DRAINED within %s' % self.MAX_STATUS_WAIT)
+ break
+
+ return not_drained_hostnames
def _complete_maintenance(self, drained_hosts):
"""End the maintenance status for a given set of hosts.
@@ -186,11 +207,11 @@ class HostMaintenance(object):
else:
log.info('All hosts passed SLA check.')
- self._drain_hosts(hosts)
+ not_drained_hostnames |= self._drain_hosts(hosts)
if not_drained_hostnames:
output = '\n'.join(list(not_drained_hostnames))
- log.info('The following hosts did not pass SLA check and were not drained:')
+ log.info('The following hosts WERE NOT DRAINED due to failed SLA check or external failures:')
print(output)
if output_file:
try:
http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/test/python/apache/aurora/admin/test_host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/test_host_maintenance.py b/src/test/python/apache/aurora/admin/test_host_maintenance.py
index 40228df..246081a 100644
--- a/src/test/python/apache/aurora/admin/test_host_maintenance.py
+++ b/src/test/python/apache/aurora/admin/test_host_maintenance.py
@@ -19,6 +19,7 @@ from contextlib import contextmanager
import mock
from twitter.common import log
from twitter.common.contextutil import temporary_file
+from twitter.common.quantity import Amount, Time
from apache.aurora.admin.host_maintenance import HostMaintenance
from apache.aurora.client.api import AuroraClientAPI
@@ -51,27 +52,38 @@ class TestHostMaintenance(unittest.TestCase):
spec=AuroraClientAPI.maintenance_status)
@mock.patch("apache.aurora.client.api.AuroraClientAPI.drain_hosts",
spec=AuroraClientAPI.drain_hosts)
- def test_drain_hosts(self, mock_drain_hosts, mock_maintenance_status):
+ @mock.patch("threading._Event.wait")
+ def test_drain_hosts(self, mock_event_wait, mock_drain_hosts, mock_maintenance_status):
fake_maintenance_status_response = [
- Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
- HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
- HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
- HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
- ])))),
- Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
- HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
- HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINING),
- HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINING)
- ])))),
- Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
- HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
- HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
- HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
- ])))),
- Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
- HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINED)
- ]))))
- ]
+ Response(
+ responseCode=ResponseCode.OK,
+ result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+ HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
+ HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
+ HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
+ ])))),
+ Response(
+ responseCode=ResponseCode.OK,
+ result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+ HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+ HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINING),
+ HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINING)
+ ])))),
+ Response(
+ responseCode=ResponseCode.OK,
+ result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+ HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+ HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+ HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+ ])))),
+ Response(
+ responseCode=ResponseCode.OK,
+ result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+ HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINED),
+ HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+ HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+ ]))))]
+
fake_maintenance_status_call_args = []
def fake_maintenance_status_side_effect(hosts):
fake_maintenance_status_call_args.append(copy.deepcopy(hosts))
@@ -81,14 +93,43 @@ class TestHostMaintenance(unittest.TestCase):
mock_maintenance_status.side_effect = fake_maintenance_status_side_effect
test_hosts = Hosts(set(TEST_HOSTNAMES))
maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
- maintenance._drain_hosts(test_hosts)
+
+ not_drained_hostnames = maintenance._drain_hosts(test_hosts)
+ assert len(not_drained_hostnames) == 0
mock_drain_hosts.assert_called_once_with(test_hosts)
assert mock_maintenance_status.call_count == 4
+ assert mock_event_wait.call_count == 4
assert fake_maintenance_status_call_args == [
(Hosts(set(TEST_HOSTNAMES))),
(Hosts(set(TEST_HOSTNAMES))),
(Hosts(set(TEST_HOSTNAMES))),
- (Hosts(set([TEST_HOSTNAMES[0]])))]
+ (Hosts(set(TEST_HOSTNAMES)))]
+
+ @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
+ spec=AuroraClientAPI.maintenance_status)
+ @mock.patch("apache.aurora.client.api.AuroraClientAPI.drain_hosts",
+ spec=AuroraClientAPI.drain_hosts)
+ @mock.patch("threading._Event.wait")
+ def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts, mock_maintenance_status):
+ fake_maintenance_status_response = Response(
+ responseCode=ResponseCode.OK,
+ result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+ HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
+ HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
+ HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
+ ]))))
+
+ mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
+ mock_maintenance_status.return_value = fake_maintenance_status_response
+ test_hosts = Hosts(set(TEST_HOSTNAMES))
+ maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+ maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)
+
+ not_drained_hostnames = maintenance._drain_hosts(test_hosts)
+ assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
+ assert mock_maintenance_status.call_count == 1
+ mock_drain_hosts.assert_called_once_with(test_hosts)
+ mock_maintenance_status.assert_called_once_with((Hosts(set(TEST_HOSTNAMES))))
@mock.patch("twitter.common.log.warning", spec=log.warning)
@mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
@@ -156,6 +197,7 @@ class TestHostMaintenance(unittest.TestCase):
mock_check_sla.return_value = set([failed_host])
mock_start_maintenance.return_value = TEST_HOSTNAMES
drained_hosts = set(TEST_HOSTNAMES) - set([failed_host])
+ mock_drain_hosts.return_value = set()
maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
with temporary_file() as fp:
http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/b5d66b3e/src/test/python/apache/aurora/client/commands/test_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_maintenance.py b/src/test/python/apache/aurora/client/commands/test_maintenance.py
index d86aaf6..13d753f 100644
--- a/src/test/python/apache/aurora/client/commands/test_maintenance.py
+++ b/src/test/python/apache/aurora/client/commands/test_maintenance.py
@@ -17,6 +17,7 @@ import contextlib
from mock import Mock, patch
from twitter.common.contextutil import temporary_file
+from apache.aurora.admin.host_maintenance import HostMaintenance
from apache.aurora.client.commands.maintenance import (
host_activate,
host_deactivate,
@@ -139,6 +140,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
return_value=mock_vector),
patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
patch('apache.aurora.client.commands.maintenance.parse_script', return_value=mock_callback),
+ patch('threading._Event.wait'),
patch('twitter.common.app.get_options', return_value=mock_options)):
host_drain([self.TEST_CLUSTER])
@@ -173,6 +175,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
return_value=mock_vector),
patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+ patch('threading._Event.wait'),
patch('twitter.common.app.get_options', return_value=mock_options)):
host_drain([self.TEST_CLUSTER])
@@ -201,6 +204,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
return_value=mock_vector),
patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+ patch('threading._Event.wait'),
patch('twitter.common.app.get_options', return_value=mock_options)):
host_drain([self.TEST_CLUSTER])
@@ -224,6 +228,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
self.create_probe_hosts(self.HOSTNAMES[1], 95, False, None),
self.create_probe_hosts(self.HOSTNAMES[2], 95, False, None)
])
+ mock_wait = Mock()
with contextlib.nested(
patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
@@ -231,11 +236,15 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
return_value=mock_vector),
patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
patch('apache.aurora.admin.admin_util.log_admin_message'),
- patch('twitter.common.app.get_options', return_value=mock_options)) as (_, _, _, log, _):
+ patch('threading._Event.wait', return_value=mock_wait),
+ patch('twitter.common.app.get_options', return_value=mock_options)
+ ) as (_, _, _, log, _, _):
+
host_drain([self.TEST_CLUSTER])
assert 'Test overrides' in log.call_args[0][1]
mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
+ mock_wait.called_once_with(HostMaintenance.MAX_STATUS_WAIT)
def test_perform_maintenance_hosts_no_prod_tasks(self):
mock_options = self.make_mock_options()
@@ -262,6 +271,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
return_value=create_empty_sla_results()),
patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+ patch('threading._Event.wait'),
patch('twitter.common.app.get_options', return_value=mock_options)):
host_drain([self.TEST_CLUSTER])