You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text version.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/12/04 17:25:09 UTC

ambari git commit: AMBARI-14182 Recovery alerts do not go away (dsen)

Repository: ambari
Updated Branches:
  refs/heads/trunk e591e9093 -> 49ffd98e0


AMBARI-14182 Recovery alerts do not go away (dsen)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/49ffd98e
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/49ffd98e
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/49ffd98e

Branch: refs/heads/trunk
Commit: 49ffd98e0a14a3ab1d461dc4013c8a8b77688411
Parents: e591e90
Author: Dmytro Sen <ds...@apache.org>
Authored: Fri Dec 4 18:24:57 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Fri Dec 4 18:24:57 2015 +0200

----------------------------------------------------------------------
 .../main/python/ambari_agent/RecoveryManager.py | 32 ++++++++--
 .../ambari_agent/alerts/recovery_alert.py       | 16 +++--
 .../src/test/python/ambari_agent/TestAlerts.py  | 34 ++++++++++-
 .../python/ambari_agent/TestRecoveryManager.py  | 62 +++++++++++++++++++-
 ambari-server/conf/unix/ambari.properties       |  1 +
 5 files changed, 131 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py b/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
index cd8e839..a7abb8a 100644
--- a/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
+++ b/ambari-agent/src/main/python/ambari_agent/RecoveryManager.py
@@ -63,7 +63,7 @@ class RecoveryManager:
     "lastAttempt": 0,
     "count": 0,
     "lastReset": 0,
-    "lifetimeCount" : 0,
+    "lifetimeCount": 0,
     "warnedLastAttempt": False,
     "warnedLastReset": False,
     "warnedThresholdReached": False
@@ -105,7 +105,7 @@ class RecoveryManager:
 
     self.__actions_json_file = os.path.join(cache_dir, self.FILENAME)
 
-    self.actions = self._load_actions()
+    self.actions = {}
 
     self.update_config(6, 60, 5, 12, recovery_enabled, auto_start_only, "", "")
 
@@ -381,6 +381,11 @@ class RecoveryManager:
     executed = False
     seconds_since_last_attempt = now - action_counter["lastAttempt"]
     if action_counter["lifetimeCount"] < self.max_lifetime_count:
+      #reset if window_in_sec seconds passed since last attempt
+      if seconds_since_last_attempt > self.window_in_sec:
+        action_counter["count"] = 0
+        action_counter["lastReset"] = now
+        action_counter["warnedLastReset"] = False
       if action_counter["count"] < self.max_count:
         if seconds_since_last_attempt > self.retry_gap_in_sec:
           action_counter["count"] += 1
@@ -428,7 +433,7 @@ class RecoveryManager:
                     "Will silently skip execution without warning till window is reset",
                     action_counter["lifetimeCount"], action_name)
       else:
-        logger.debug("%s occurrences in agent life time reached the limit for %s",
+        logger.error("%s occurrences in agent life time reached the limit for %s",
                      action_counter["lifetimeCount"], action_name)
     self._dump_actions()
     return executed
@@ -474,13 +479,30 @@ class RecoveryManager:
 
   def get_actions_copy(self):
     """
-    Loads recovery actions from FS
     :return:  recovery actions copy
     """
-    return self._load_actions()
+    self.__status_lock.acquire()
+    try:
+      return copy.deepcopy(self.actions)
+    finally:
+      self.__status_lock.release()
     pass
 
 
+  def is_action_info_stale(self, action_name):
+    """
+    Checks if the action info is stale
+    :param action_name:
+    :return: if the action info for action_name: is stale
+    """
+    if action_name in self.actions:
+      action_counter = self.actions[action_name]
+      now = self._now_()
+      seconds_since_last_attempt = now - action_counter["lastAttempt"]
+      return seconds_since_last_attempt > self.window_in_sec
+    return False
+    pass
+
   def _execute_action_chk_only(self, action_name):
     """
     _private_ implementation of [may] execute check only

http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
index 60744b5..760a737 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/recovery_alert.py
@@ -69,16 +69,20 @@ class RecoveryAlert(BaseAlert):
     if component in recovery_actions:
       recovery_action_info = recovery_actions[component]
 
-    recovered_times = 0
-    if 'count' in recovery_action_info:
-      recovered_times = recovery_action_info['count']
-    lastResetText = ""
-    if 'lastReset' in recovery_action_info:
-      lastResetText = " since " + str(datetime.datetime.fromtimestamp(recovery_action_info['lastReset']))
     warned_threshold_reached = False
     if 'warnedThresholdReached' in recovery_action_info:
       warned_threshold_reached = recovery_action_info['warnedThresholdReached']
 
+    recovered_times = 0
+    lastResetText = ""
+
+    # The alert should not go away if warned_threshold_reached (max_lifetime_count reached)
+    if not self.recovery_manager.is_action_info_stale(component) or warned_threshold_reached:
+      if 'count' in recovery_action_info:
+        recovered_times = recovery_action_info['count']
+      if 'lastReset' in recovery_action_info:
+        lastResetText = " since " + str(datetime.datetime.fromtimestamp(recovery_action_info['lastReset']))
+
     if recovered_times >= self.critical_count or warned_threshold_reached:
       result = self.RESULT_CRITICAL
     elif recovered_times >= self.warning_count:

http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
index a01bb2c..8344238 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
@@ -112,9 +112,11 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
     self.assertEquals('CRITICAL', alerts[0]['state'])
 
+  @patch.object(RecoveryManager, "is_action_info_stale")
   @patch.object(RecoveryManager, "get_actions_copy")
-  def test_recovery_alert(self, rm_get_actions_mock):
+  def test_recovery_alert(self, rm_get_actions_mock, is_stale_mock):
     definition_json = self._get_recovery_alert_definition()
+    is_stale_mock.return_value = False
     rm_get_actions_mock.return_value = {
         "METRICS_COLLECTOR": {
           "count": 0,
@@ -177,6 +179,36 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
     self.assertEquals('CRITICAL', alerts[0]['state'])
 
+    # OK again, after recovery manager window expired
+    is_stale_mock.return_value = True
+
+    alert.collect()
+    alerts = collector.alerts()
+    self.assertEquals(0, len(collector.alerts()))
+    self.assertEquals('OK', alerts[0]['state'])
+
+    #  CRIT, after recovery manager window expired,
+    # but max_lifetime_count reached, warnedThresholdReached == True
+    rm_get_actions_mock.return_value = {
+      "METRICS_COLLECTOR": {
+        "count": 5,
+        "lastAttempt": 1447860184,
+        "warnedLastReset": False,
+        "lastReset": 1447860184,
+        "warnedThresholdReached": True,
+        "lifetimeCount": 12,
+        "warnedLastAttempt": False
+      }
+    }
+
+    is_stale_mock.return_value = True
+
+    alert.collect()
+    alerts = collector.alerts()
+    self.assertEquals(0, len(collector.alerts()))
+    self.assertEquals('CRITICAL', alerts[0]['state'])
+
+
   @patch.object(socket.socket,"connect")
   def test_port_alert_complex_uri(self, socket_connect_mock):
     definition_json = self._get_port_alert_definition()

http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py b/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
index 1335dab..a2b4968 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestRecoveryManager.py
@@ -25,7 +25,7 @@ from ambari_agent.RecoveryManager import RecoveryManager
 from mock.mock import patch, MagicMock, call
 
 
-class TestRecoveryManager(TestCase):
+class _TestRecoveryManager(TestCase):
   command = {
     "commandType": "STATUS_COMMAND",
     "payloadLevel": "EXECUTION_COMMAND",
@@ -516,7 +516,7 @@ class TestRecoveryManager(TestCase):
   @patch.object(RecoveryManager, "_now_")
   def test_recovery_report(self, time_mock):
     time_mock.side_effect = \
-      [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1715]
+      [1000, 1071, 1072, 1470, 1471, 1472, 1543, 1644, 1815]
 
     rm = RecoveryManager(tempfile.mktemp())
     rec_st = rm.get_recovery_status()
@@ -630,3 +630,61 @@ class TestRecoveryManager(TestCase):
     self.assertTrue(rm.configured_for_recovery("D"))
     self.assertFalse(rm.configured_for_recovery("E"))
     self.assertTrue(rm.configured_for_recovery("F"))
+
+  @patch.object(RecoveryManager, "_now_")
+  def test_reset_if_window_passed_since_last_attempt(self, time_mock):
+    time_mock.side_effect = \
+      [1000, 1071, 1372]
+    rm = RecoveryManager(tempfile.mktemp(), True)
+
+    rm.update_config(2, 5, 1, 4, True, True, "", "")
+
+    rm.execute("COMPONENT")
+    actions = rm.get_actions_copy()["COMPONENT"]
+    self.assertEquals(actions['lastReset'], 1000)
+    rm.execute("COMPONENT")
+    actions = rm.get_actions_copy()["COMPONENT"]
+    self.assertEquals(actions['lastReset'], 1000)
+    #reset if window_in_sec seconds passed since last attempt
+    rm.execute("COMPONENT")
+    actions = rm.get_actions_copy()["COMPONENT"]
+    self.assertEquals(actions['lastReset'], 1372)
+
+
+  @patch.object(RecoveryManager, "_now_")
+  def test_is_action_info_stale(self, time_mock):
+
+    rm = RecoveryManager(tempfile.mktemp(), True)
+    rm.update_config(5, 60, 5, 16, True, False, "", "")
+
+    # rm.actions = {}
+
+    time_mock.return_value = 0
+    self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+    rm.actions["COMPONENT_NAME"] = {
+      "lastAttempt": 0,
+      "count": 0,
+      "lastReset": 0,
+      "lifetimeCount": 0,
+      "warnedLastAttempt": False,
+      "warnedLastReset": False,
+      "warnedThresholdReached": False
+    }
+    time_mock.return_value = 3600
+    self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+    rm.actions["COMPONENT_NAME"] = {
+      "lastAttempt": 1,
+      "count": 1,
+      "lastReset": 0,
+      "lifetimeCount": 1,
+      "warnedLastAttempt": False,
+      "warnedLastReset": False,
+      "warnedThresholdReached": False
+    }
+    time_mock.return_value = 3601
+    self.assertFalse(rm.is_action_info_stale("COMPONENT_NAME"))
+
+    time_mock.return_value = 3602
+    self.assertTrue(rm.is_action_info_stale("COMPONENT_NAME"))

http://git-wip-us.apache.org/repos/asf/ambari/blob/49ffd98e/ambari-server/conf/unix/ambari.properties
----------------------------------------------------------------------
diff --git a/ambari-server/conf/unix/ambari.properties b/ambari-server/conf/unix/ambari.properties
index 3dac63e..fd1b91e 100644
--- a/ambari-server/conf/unix/ambari.properties
+++ b/ambari-server/conf/unix/ambari.properties
@@ -108,3 +108,4 @@ views.http.x-frame-options=SAMEORIGIN
 # Enable Metrics Collector auto-restart
 recovery.type=AUTO_START
 recovery.enabled_components=METRICS_COLLECTOR
+recovery.lifetime_max_count=1024
\ No newline at end of file