You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2015/08/04 16:40:38 UTC

ambari git commit: AMBARI-12622 - Malformed Alert Data Can Prevent Alerts From Reporting (jonathanhurley)

Repository: ambari
Updated Branches:
  refs/heads/trunk b5bd53a10 -> 5dbb0e1b8


AMBARI-12622 - Malformed Alert Data Can Prevent Alerts From Reporting (jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/5dbb0e1b
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/5dbb0e1b
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/5dbb0e1b

Branch: refs/heads/trunk
Commit: 5dbb0e1b87a8720a1e20920190a552193ed90bf4
Parents: b5bd53a
Author: Jonathan Hurley <jh...@hortonworks.com>
Authored: Mon Aug 3 11:20:52 2015 -0400
Committer: Jonathan Hurley <jh...@hortonworks.com>
Committed: Tue Aug 4 10:40:31 2015 -0400

----------------------------------------------------------------------
 .../ambari_agent/AlertSchedulerHandler.py       |  45 ++++---
 .../python/ambari_agent/alerts/base_alert.py    |  38 ++++--
 .../python/ambari_agent/alerts/metric_alert.py  |   2 +-
 .../src/test/python/ambari_agent/TestAlerts.py  | 134 +++++++++++++++++++
 4 files changed, 190 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/5dbb0e1b/ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py b/ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py
index c41397d..cddee57 100644
--- a/ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py
+++ b/ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py
@@ -260,28 +260,33 @@ class AlertSchedulerHandler():
     converts the json that represents all aspects of a definition
     and makes an object that extends BaseAlert that is used for individual
     """
-    source = json_definition['source']
-    source_type = source.get('type', '')
-
-    if logger.isEnabledFor(logging.DEBUG):
-      logger.debug("[AlertScheduler] Creating job type {0} with {1}".format(source_type, str(json_definition)))
-
     alert = None
 
-    if source_type == AlertSchedulerHandler.TYPE_METRIC:
-      alert = MetricAlert(json_definition, source, self.config)
-    elif source_type == AlertSchedulerHandler.TYPE_PORT:
-      alert = PortAlert(json_definition, source)
-    elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
-      source['stacks_directory'] = self.stacks_dir
-      source['common_services_directory'] = self.common_services_dir
-      source['host_scripts_directory'] = self.host_scripts_dir
-      alert = ScriptAlert(json_definition, source, self.config)
-    elif source_type == AlertSchedulerHandler.TYPE_WEB:
-      alert = WebAlert(json_definition, source, self.config)
-
-    if alert is not None:
-      alert.set_cluster(clusterName, hostName)
+    try:
+      source = json_definition['source']
+      source_type = source.get('type', '')
+
+      if logger.isEnabledFor(logging.DEBUG):
+        logger.debug("[AlertScheduler] Creating job type {0} with {1}".format(source_type, str(json_definition)))
+
+
+      if source_type == AlertSchedulerHandler.TYPE_METRIC:
+        alert = MetricAlert(json_definition, source, self.config)
+      elif source_type == AlertSchedulerHandler.TYPE_PORT:
+        alert = PortAlert(json_definition, source)
+      elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
+        source['stacks_directory'] = self.stacks_dir
+        source['common_services_directory'] = self.common_services_dir
+        source['host_scripts_directory'] = self.host_scripts_dir
+        alert = ScriptAlert(json_definition, source, self.config)
+      elif source_type == AlertSchedulerHandler.TYPE_WEB:
+        alert = WebAlert(json_definition, source, self.config)
+
+      if alert is not None:
+        alert.set_cluster(clusterName, hostName)
+
+    except Exception,exception:
+      logger.exception("[AlertScheduler] Unable to load an invalid alert definition. It will be skipped.")
 
     return alert
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/5dbb0e1b/ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
index 9151796..fd6b03c 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
@@ -145,23 +145,45 @@ class BaseAlert(object):
     
     if logger.isEnabledFor(logging.DEBUG):
       logger.debug("[Alert][{0}] result = {1}".format(self.get_name(), str(res)))
-      
+
     data = {}
     data['name'] = self._get_alert_meta_value_safely('name')
     data['label'] = self._get_alert_meta_value_safely('label')
-    data['state'] = res[0]
-    data['text'] = res_base_text.format(*res[1])
+    data['uuid'] = self._get_alert_meta_value_safely('uuid')
     data['cluster'] = self.cluster_name
     data['service'] = self._get_alert_meta_value_safely('serviceName')
     data['component'] = self._get_alert_meta_value_safely('componentName')
     data['timestamp'] = long(time.time() * 1000)
-    data['uuid'] = self._get_alert_meta_value_safely('uuid')
     data['enabled'] = self._get_alert_meta_value_safely('enabled')
 
-    if logger.isEnabledFor(logging.DEBUG):
-      logger.debug("[Alert][{0}] text = {1}".format(self.get_name(), data['text']))
-    
-    self.collector.put(self.cluster_name, data)
+    try:
+      data['state'] = res[0]
+
+      # * is the splat operator, which flattens a collection into positional arguments
+      # flatten the array and then try formatting it
+      try:
+        data['text'] = res_base_text.format(*res[1])
+      except ValueError, value_error:
+        logger.warn("[Alert][{0}] - {1}".format(self.get_name(), str(value_error)))
+
+        # if there is a ValueError, it's probably because the text doesn't match the type of
+        # positional arguemtns (ie {0:d} with a float)
+        res_base_text = res_base_text.replace("d}", "s}")
+        data_as_strings = map(str, res[1])
+        data['text'] = res_base_text.format(*data_as_strings)
+
+      if logger.isEnabledFor(logging.DEBUG):
+        logger.debug("[Alert][{0}] text = {1}".format(self.get_name(), data['text']))
+    except Exception, exception:
+      logger.exception("[Alert][{0}] - The alert's data is not properly formatted".format(self.get_name()))
+
+      # if there's a problem with getting the data returned from collect() then mark this
+      # alert as UNKNOWN
+      data['state'] = self.RESULT_UNKNOWN
+      data['text'] = "There is a problem with the alert definition: {0}".format(str(exception))
+    finally:
+      # put the alert into the collector so it can be collected on the next run
+      self.collector.put(self.cluster_name, data)
 
 
   def _get_configuration_value(self, key):

http://git-wip-us.apache.org/repos/asf/ambari/blob/5dbb0e1b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
index 6bcdacd..aa4ad75 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
@@ -96,7 +96,7 @@ class MetricAlert(BaseAlert):
         collect_result = self.RESULT_UNKNOWN
         value_list.append('HTTP {0} response (metrics unavailable)'.format(str(http_code)))
       elif not jmx_property_values and http_code not in [200, 307]:
-        raise Exception("[Alert][{0}] Unable to get json from jmx response!".format(self.get_name()))
+        raise Exception("[Alert][{0}] Unable to extract JSON from JMX response".format(self.get_name()))
       else:
         value_list.extend(jmx_property_values)
         check_value = self.metric_info.calculate(value_list)

http://git-wip-us.apache.org/repos/asf/ambari/blob/5dbb0e1b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
index 7b64445..dab717d 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
@@ -1037,6 +1037,82 @@ class TestAlerts(TestCase):
     self.assertEquals('(Unit Tests) OK: 10 5 2.0', alerts[0]['text'])
 
 
+
+  @patch.object(socket.socket,"connect")
+  def test_alert_definition_value_error_conversion(self, socket_connect_mock):
+    """
+    Tests that an alert definition with text that doesn't match the type of positional arguments
+    can recover and retry the ValueError.
+    :param socket_connect_mock:
+    :return:
+    """
+    definition_json = self._get_alert_definition_with_value_error_text()
+
+    configuration = {'hdfs-site' :
+      { 'my-key': 'c6401.ambari.apache.org:2181'}
+    }
+
+    collector = AlertCollector()
+    cluster_configuration = self.__get_cluster_configuration()
+    self.__update_cluster_configuration(cluster_configuration, configuration)
+
+    alert = PortAlert(definition_json, definition_json['source'])
+    alert.set_helpers(collector, cluster_configuration)
+    alert.set_cluster("c1", "c6402.ambari.apache.org")
+
+    # use a URI that has commas to verify that we properly parse it
+    alert.set_helpers(collector, cluster_configuration)
+    alert.set_cluster("c1", "c6401.ambari.apache.org")
+    self.assertEquals(6, alert.interval())
+
+    # the collect should catch the invalid text in the definition
+    # ValueError: Unknown format code 'd' for object of type 'float'
+    alert.collect()
+
+    alerts = collector.alerts()
+    self.assertEquals(0, len(collector.alerts()))
+
+    self.assertEquals('OK', alerts[0]['state'])
+    self.assertTrue('(Unit Tests) TCP OK' in alerts[0]['text'])
+
+
+  @patch.object(socket.socket,"connect")
+  def test_alert_definition_too_many_positional_arguments(self, socket_connect_mock):
+    """
+    Tests that an alert definition with too many arguments produces an alert to collect after the
+    exceptioin is raised.
+    :param socket_connect_mock:
+    :return:
+    """
+    definition_json = self._get_alert_definition_with_too_many_positional_arguments()
+
+    configuration = {'hdfs-site' :
+      { 'my-key': 'c6401.ambari.apache.org:2181'}
+    }
+
+    collector = AlertCollector()
+    cluster_configuration = self.__get_cluster_configuration()
+    self.__update_cluster_configuration(cluster_configuration, configuration)
+
+    alert = PortAlert(definition_json, definition_json['source'])
+    alert.set_helpers(collector, cluster_configuration)
+    alert.set_cluster("c1", "c6402.ambari.apache.org")
+
+    # use a URI that has commas to verify that we properly parse it
+    alert.set_helpers(collector, cluster_configuration)
+    alert.set_cluster("c1", "c6401.ambari.apache.org")
+    self.assertEquals(6, alert.interval())
+
+    # the collect should catch the invalid text in the definition
+    # ValueError: Unknown format code 'd' for object of type 'float'
+    alert.collect()
+
+    alerts = collector.alerts()
+    self.assertEquals(0, len(collector.alerts()))
+
+    self.assertEquals('UNKNOWN', alerts[0]['state'])
+    self.assertTrue('There is a problem with the alert definition' in alerts[0]['text'])
+
   def __get_cluster_configuration(self):
     """
     Gets an instance of the cluster cache where the file read and write
@@ -1251,6 +1327,64 @@ class TestAlerts(TestCase):
       }
     }
 
+  def _get_alert_definition_with_value_error_text(self):
+    return { "name": "namenode_process",
+      "service": "HDFS",
+      "component": "NAMENODE",
+      "label": "NameNode process",
+      "interval": 6,
+      "scope": "host",
+      "enabled": True,
+      "uuid": "c1f73191-4481-4435-8dae-fd380e4c0be1",
+      "source": {
+        "type": "PORT",
+        "uri": "{{hdfs-site/my-key}}",
+        "default_port": 50070,
+        "reporting": {
+          "ok": {
+            "text": "(Unit Tests) TCP OK {0:.4d}"
+          },
+          "warning": {
+            "text": "(Unit Tests) TCP Warning {0:.4d}",
+            "value": 1.5
+          },
+          "critical": {
+            "text": "(Unit Tests) TCP Critical {0:.4d}",
+            "value": 5.0
+          }
+        }
+      }
+    }
+
+  def _get_alert_definition_with_too_many_positional_arguments(self):
+    return { "name": "namenode_process",
+      "service": "HDFS",
+      "component": "NAMENODE",
+      "label": "NameNode process",
+      "interval": 6,
+      "scope": "host",
+      "enabled": True,
+      "uuid": "c1f73191-4481-4435-8dae-fd380e4c0be1",
+      "source": {
+        "type": "PORT",
+        "uri": "{{hdfs-site/my-key}}",
+        "default_port": 50070,
+        "reporting": {
+          "ok": {
+            "text": "Bad Syntax Going To Mess You Up {0:.4d} {1} {2} {3} {4}"
+          },
+          "warning": {
+            "text": "Bad Syntax Going To Mess You Up {0:.4d} {1} {2} {3} {4}",
+            "value": 1.5
+          },
+          "critical": {
+            "text": "Bad Syntax Going To Mess You Up {0:.4d} {1} {2} {3} {4}",
+            "value": 5.0
+          }
+        }
+      }
+    }
+
 class MockAlert(BaseAlert):
   """
   Mock class for testing