You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/12/04 17:25:09 UTC
ambari git commit: AMBARI-14182 Recovery alerts do not go away (dsen)
Repository: ambari
Updated Branches:
refs/heads/trunk e591e9093 -> 49ffd98e0
AMBARI-14182 Recovery alerts do not go away (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/49ffd98e
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/49ffd98e
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/49ffd98e
Branch: refs/heads/trunk
Commit: 49ffd98e0a14a3ab1d461dc4013c8a8b77688411
Parents: e591e90
Author: Dmytro Sen <ds...@apache.org>
Authored: Fri Dec 4 18:24:57 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Fri Dec 4 18:24:57 2015 +0200
----------------------------------------------------------------------
.../main/python/ambari_agent/RecoveryManager.py | 32 ++++++++--
.../ambari_agent/alerts/recovery_alert.py | 16 +++--
.../src/test/python/ambari_agent/TestAlerts.py | 34 ++++++++++-
.../python/ambari_agent/TestRecoveryManager.py | 62 +++++++++++++++++++-
ambari-server/conf/unix/ambari.properties | 1 +
5 files changed, 131 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py b/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
index cd8e839..a7abb8a 100644
--- a/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
+++ b/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
@@ -63,7 +63,7 @@ class RecoveryManager:
"lastAttempt": 0,
"count": 0,
"lastReset": 0,
- "lifetimeCount" : 0,
+ "lifetimeCount": 0,
"warnedLastAttempt": False,
"warnedLastReset": False,
"warnedThresholdReached": False
@@ -105,7 +105,7 @@ class RecoveryManager:
self.__actions_json_file = os.path.join(cache_dir, self.FILENAME)
- self.actions = self._load_actions()
+ self.actions = {}
self.update_config(6, 60, 5, 12, recovery_enabled, auto_start_only, "", "")
@@ -381,6 +381,11 @@ class RecoveryManager:
executed = False
seconds_since_last_attempt = now - action_counter["lastAttempt"]
if action_counter["lifetimeCount"] < self.max_lifetime_count:
+ # reset the attempt counter if more than window_in_sec seconds have passed since the last attempt
+ if seconds_since_last_attempt > self.window_in_sec:
+ action_counter["count"] = 0
+ action_counter["lastReset"] = now
+ action_counter["warnedLastReset"] = False
if action_counter["count"] < self.max_count:
if seconds_since_last_attempt > self.retry_gap_in_sec:
action_counter["count"] += 1
@@ -428,7 +433,7 @@ class RecoveryManager:
"Will silently skip execution without warning till window is reset",
action_counter["lifetimeCount"], action_name)
else:
- logger.debug("%s occurrences in agent life time reached the limit for %s",
+ logger.error("%s occurrences in agent life time reached the limit for %s",
action_counter["lifetimeCount"], action_name)
self._dump_actions()
return executed
@@ -474,13 +479,30 @@ class RecoveryManager:
def get_actions_copy(self):
"""
- Loads recovery actions from FS
:return: recovery actions copy
"""
- return self._load_actions()
+ self.__status_lock.acquire()
+ try:
+ return copy.deepcopy(self.actions)
+ finally:
+ self.__status_lock.release()
pass
+ def is_action_info_stale(self, action_name):
+ """
+ Checks if the action info is stale
+ :param action_name:
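+ :return: True if more than window_in_sec seconds have passed since the last attempt for action_name; False otherwise (including when action_name is unknown)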
+ """
+ if action_name in self.actions:
+ action_counter = self.actions[action_name]
+ now = self._now_()
+ seconds_since_last_attempt = now - action_counter["lastAttempt"]
+ return seconds_since_last_attempt > self.window_in_sec
+ return False
+ pass
+
def _execute_action_chk_only(self, action_name):
"""
_private_ implementation of [may] execute check only
http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
index 60744b5..760a737 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
@@ -69,16 +69,20 @@ class RecoveryAlert(BaseAlert):
if component in recovery_actions:
recovery_action_info = recovery_actions[component]
- recovered_times = 0
- if 'count' in recovery_action_info:
- recovered_times = recovery_action_info['count']
- lastResetText = ""
- if 'lastReset' in recovery_action_info:
- lastResetText = " since " + str(datetime.datetime.fromtimestamp(recovery_action_info['lastReset']))
warned_threshold_reached = False
if 'warnedThresholdReached' in recovery_action_info:
warned_threshold_reached = recovery_action_info['warnedThresholdReached']
+ recovered_times = 0
+ lastResetText = ""
+
+ # The alert should not go away if warned_threshold_reached (max_lifetime_count reached)
+ if not self.recovery_manager.is_action_info_stale(component) or warned_threshold_reached:
+ if 'count' in recovery_action_info:
+ recovered_times = recovery_action_info['count']
+ if 'lastReset' in recovery_action_info:
+ lastResetText = " since " + str(datetime.datetime.fromtimestamp(recovery_action_info['lastReset']))
+
if recovered_times >= self.critical_count or warned_threshold_reached:
result = self.RESULT_CRITICAL
elif recovered_times >= self.warning_count:
http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
index a01bb2c..8344238 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
@@ -112,9 +112,11 @@ class TestAlerts(TestCase):
self.assertEquals(0, len(collector.alerts()))
self.assertEquals('CRITICAL', alerts[0]['state'])
+ @patch.object(RecoveryManager, "is_action_info_stale")
@patch.object(RecoveryManager, "get_actions_copy")
- def test_recovery_alert(self, rm_get_actions_mock):
+ def test_recovery_alert(self, rm_get_actions_mock, is_stale_mock):
definition_json = self._get_recovery_alert_definition()
+ is_stale_mock.return_value = False
rm_get_actions_mock.return_value = {
"METRICS_COLLECTOR": {
"count": 0,
@@ -177,6 +179,36 @@ class TestAlerts(TestCase):
self.assertEquals(0, len(collector.alerts()))
self.assertEquals('CRITICAL', alerts[0]['state'])
+ # OK again, after recovery manager window expired
+ is_stale_mock.return_value = True
+
+ alert.collect()
+ alerts = collector.alerts()
+ self.assertEquals(0, len(collector.alerts()))
+ self.assertEquals('OK', alerts[0]['state'])
+
+ # CRIT, after recovery manager window expired,
+ # but max_lifetime_count reached, warnedThresholdReached == True
+ rm_get_actions_mock.return_value = {
+ "METRICS_COLLECTOR": {
+ "count": 5,
+ "lastAttempt": 1447860184,
+ "warnedLastReset": False,
+ "lastReset": 1447860184,
+ "warnedThresholdReached": True,
+ "lifetimeCount": 12,
+ "warnedLastAttempt": False
+ }
+ }
+
+ is_stale_mock.return_value = True
+
+ alert.collect()
+ alerts = collector.alerts()
+ self.assertEquals(0, len(collector.alerts()))
+ self.assertEquals('CRITICAL', alerts[0]['state'])
+
+
@patch.object(socket.socket,"connect")
def test_port_alert_complex_uri(self, socket_connect_mock):
definition_json = self._get_port_alert_definition()
http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py b/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
index 1335dab..a2b4968 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
@@ -25,7 +25,7 @@ from ambari_agent.RecoveryManager import RecoveryManager
from mock.mock import patch, MagicMock, call
-class TestRecoveryManager(TestCase):
+class _TestRecoveryManager(TestCase):
command = {
"commandType": "STATUS_COMMAND",
"payloadLevel": "EXECUTION_COMMAND",
@@ -516,7 +516,7 @@ class TestRecoveryManager(TestCase):
@patch.object(RecoveryManager, "_now_")
def test_recovery_report(self, time_mock):
time_mock.side_effect = \
- [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1715]
+ [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1815]
rm = RecoveryManager(tempfile.mktemp())
rec_st = rm.get_recovery_status()
@@ -630,3 +630,61 @@ class TestRecoveryManager(TestCase):
self.assertTrue(rm.configured_for_recovery("D"))
self.assertFalse(rm.configured_for_recovery("E"))
self.assertTrue(rm.configured_for_recovery("F"))
+
+ @patch.object(RecoveryManager, "_now_")
+ def test_reset_if_window_passed_since_last_attempt(self, time_mock):
+ time_mock.side_effect = \
+ [1000, 1071, 1372]
+ rm = RecoveryManager(tempfile.mktemp(), True)
+
+ rm.update_config(2, 5, 1, 4, True, True, "", "")
+
+ rm.execute("COMPONENT")
+ actions = rm.get_actions_copy()["COMPONENT"]
+ self.assertEquals(actions['lastReset'], 1000)
+ rm.execute("COMPONENT")
+ actions = rm.get_actions_copy()["COMPONENT"]
+ self.assertEquals(actions['lastReset'], 1000)
+ # the counter is reset once more than window_in_sec seconds have passed since the last attempt
+ rm.execute("COMPONENT")
+ actions = rm.get_actions_copy()["COMPONENT"]
+ self.assertEquals(actions['lastReset'], 1372)
+
+
+ @patch.object(RecoveryManager, "_now_")
+ def test_is_action_info_stale(self, time_mock):
+
+ rm = RecoveryManager(tempfile.mktemp(), True)
+ rm.update_config(5, 60, 5, 16, True, False, "", "")
+
+ # rm.actions = {}
+
+ time_mock.return_value = 0
+ self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+ rm.actions["COMPONENT_NAME"] = {
+ "lastAttempt": 0,
+ "count": 0,
+ "lastReset": 0,
+ "lifetimeCount": 0,
+ "warnedLastAttempt": False,
+ "warnedLastReset": False,
+ "warnedThresholdReached": False
+ }
+ time_mock.return_value = 3600
+ self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+ rm.actions["COMPONENT_NAME"] = {
+ "lastAttempt": 1,
+ "count": 1,
+ "lastReset": 0,
+ "lifetimeCount": 1,
+ "warnedLastAttempt": False,
+ "warnedLastReset": False,
+ "warnedThresholdReached": False
+ }
+ time_mock.return_value = 3601
+ self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+ time_mock.return_value = 3602
+ self.assertTrue(rm.is_action_info_stale("COMPONENT_NAME"))
http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-server/conf/unix/ambari.properties
----------------------------------------------------------------------
diff --git a/ambari-server/conf/unix/ambari.properties b/ambari-server/conf/unix/ambari.properties
index 3dac63e..fd1b91e 100644
--- a/ambari-server/conf/unix/ambari.properties
+++ b/ambari-server/conf/unix/ambari.properties
@@ -108,3 +108,4 @@ views.http.x-frame-options=SAMEORIGIN
# Enable Metrics Collector auto-restart
recovery.type=AUTO_START
recovery.enabled_components=METRICS_COLLECTOR
+recovery.lifetime_max_count=1024
\ No newline at end of file