You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ja...@apache.org on 2016/02/26 00:46:06 UTC

ambari git commit: AMBARI-15105: Add alerts for HAWQ components status (bhuvnesh2703 via jaoki)

Repository: ambari
Updated Branches:
  refs/heads/trunk 8fb17ab09 -> 29115e81e


AMBARI-15105: Add alerts for HAWQ components status (bhuvnesh2703 via jaoki)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/29115e81
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/29115e81
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/29115e81

Branch: refs/heads/trunk
Commit: 29115e81ee2ba6643c7725903300f070c4ba2ea5
Parents: 8fb17ab
Author: Jun Aoki <ja...@apache.org>
Authored: Thu Feb 25 15:45:58 2016 -0800
Committer: Jun Aoki <ja...@apache.org>
Committed: Thu Feb 25 15:45:58 2016 -0800

----------------------------------------------------------------------
 .../common-services/HAWQ/2.0.0/alerts.json      |  93 +++++++++++-
 .../package/alerts/alert_component_status.py    |  76 ++++++++++
 .../2.3/HAWQ/test_alert_component_status.py     | 141 +++++++++++++++++++
 ambari-web/app/views/main/dashboard/widgets.js  |   2 +-
 4 files changed, 310 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/29115e81/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json
index 3119a0c..14ad6d7 100644
--- a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json
@@ -1,5 +1,32 @@
 {
   "HAWQ": {
+    "service": [
+      {
+        "name": "hawq_segment_process_percent",
+        "label": "Percent HAWQ Segments Available",
+        "description": "This alert is triggered if the number of down HAWQ Segments in the cluster is greater than the configured critical threshold.",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "hawq_segment_process",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.1
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.3
+            }
+          }
+        }
+      }
+    ],
     "HAWQMASTER": [
       {
         "name": "hawqstandby_sync_status",
@@ -13,7 +40,71 @@
           "path": "HAWQ/2.0.0/package/alerts/alert_sync_status.py",
           "parameters": []
         }
+      },
+      {
+        "name": "hawq_master_process",
+        "label": "HAWQ Master Process",
+        "description": "This alert is triggered if the HAWQ Master process cannot be confirmed to be up and listening on the network.",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "SCRIPT",
+          "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py",
+          "parameters": [
+            {
+              "name": "component_name",
+              "display_name": "Component Name",
+              "value": "master",
+              "type": "STRING",
+              "description": "This text string indicates if it is a Master, Standby or Segment"
+            }
+          ]
+        }
+      }
+    ],
+    "HAWQSEGMENT": [
+      {
+        "name": "hawq_segment_process",
+        "label": "HAWQ Segment Process",
+        "description": "This host-level alert is triggered if the HAWQ Segment process cannot be confirmed to be up and listening on the network.",
+        "interval": 1,
+        "scope": "HOST",
+        "source": {
+          "type": "SCRIPT",
+          "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py",
+          "parameters": [
+            {
+              "name": "component_name",
+              "display_name": "Component Name",
+              "value": "segment",
+              "type": "STRING",
+              "description": "This text string indicates if it is a Master, Standby or Segment"
+            }
+          ]
+        }
+      }
+    ],
+    "HAWQSTANDBY": [
+      {
+        "name": "hawq_standby_process",
+        "label": "HAWQ Standby Process",
+        "description": "This alert is triggered if the HAWQ Standby process cannot be confirmed to be up and listening on the network.",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "SCRIPT",
+          "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py",
+          "parameters": [
+            {
+              "name": "component_name",
+              "display_name": "Component Name",
+              "value": "standby",
+              "type": "STRING",
+              "description": "This text string indicates if it is a Master, Standby or Segment"
+            }
+          ]
+        }
       }
     ]
   }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/ambari/blob/29115e81/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py
new file mode 100644
index 0000000..9ca9ac6
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from resource_management.core.shell import call
+
+HAWQMASTER_PORT = '{{hawq-site/hawq_master_address_port}}'
+HAWQSEGMENT_PORT = '{{hawq-site/hawq_segment_address_port}}'
+HAWQSTANDBY_ADDRESS = '{{hawq-site/hawq_standby_address_host}}'
+
+RESULT_STATE_OK = 'OK'
+RESULT_STATE_UNKNOWN = 'UNKNOWN'
+RESULT_STATE_SKIPPED = 'SKIPPED'
+RESULT_STATE_CRITICAL = 'CRITICAL'
+
+COMPONENT_PROCESS_MAP = {
+                         "segment": "postgres",
+                         "master": "postgres",
+                         "standby": "gpsyncmaster"
+                        }
+
+
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used to build the dictionary passed into execute
+  """
+  return (HAWQMASTER_PORT, HAWQSEGMENT_PORT, HAWQSTANDBY_ADDRESS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  if configurations is None:
+    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
+
+  component = parameters['component_name']
+  # Identify port of the process
+  port = configurations[HAWQSEGMENT_PORT] if component == "segment" else configurations[HAWQMASTER_PORT]
+
+  component_name = component.capitalize()
+  is_running = is_component_running(port, COMPONENT_PROCESS_MAP[component])
+  if is_running:
+    return (RESULT_STATE_OK, ['HAWQ {0} is running'.format(component_name)])
+  else:
+    return (RESULT_STATE_CRITICAL, ['HAWQ {0} is not running'.format(component_name)])
+
+def is_component_running(port, process):
+  """
+  Check if the process is running on the specified port
+  """
+  cmd = "netstat -tupln | egrep ':{0}\s' | egrep {1}".format(port, process)
+  rc, op= call(cmd, timeout=60)
+  return rc == 0

http://git-wip-us.apache.org/repos/asf/ambari/blob/29115e81/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py
new file mode 100644
index 0000000..b2e1d4d
--- /dev/null
+++ b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+'''
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+# System imports
+import os
+import sys
+
+from mock.mock import patch
+
+# Local imports
+from stacks.utils.RMFTestCase import *
+
+COMMON_SERVICES_ALERTS_DIR = "HAWQ/2.0.0/package/alerts"
+
+file_path = os.path.dirname(os.path.abspath(__file__))
+file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file_path)))))
+file_path = os.path.join(file_path, "main", "resources", "common-services", COMMON_SERVICES_ALERTS_DIR)
+
+WORKING_CONFIGS = {
+                    "{{hawq-site/hawq_master_address_port}}": "5432",
+                    "{{hawq-site/hawq_segment_address_port}}": "40000",
+                    "{{hawq-site/hawq_standby_address_host}}": "c6402.ambari.apache.org"
+                  }
+
+class TestAlertComponentStatus(RMFTestCase):
+
+  def setUp(self):
+    """
+    Import the class under test.
+    Because the class is present in a different folder, append its dir to the system path.
+    Also, shorten the import name and make it a global so the test functions can access it.
+    :return:
+    """
+    sys.path.append(file_path)
+    global alert_component_status
+    import alert_component_status
+
+  def test_missing_configs(self):
+    """
+    Check that the status is UNKNOWN when configs are missing.
+    """
+    configs = None
+    [status, messages] = alert_component_status.execute(configurations=configs)
+    self.assertEqual(status, alert_component_status.RESULT_STATE_UNKNOWN)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'There were no configurations supplied to the script.')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_master_ok(self, is_component_running_mock):
+    """
+    Test that the status is OK when HAWQ Master is up
+    """
+    # Mock calls
+    is_component_running_mock.return_value = True
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'master'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_OK)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Master is running')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_master_critical(self, is_component_running_mock):
+    """
+    Test that the status is CRITICAL when HAWQ Master is down
+    """
+    # Mock calls
+    is_component_running_mock.return_value = False
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'master'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Master is not running')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_standby_ok(self, is_component_running_mock):
+    """
+    Test that the status is OK when HAWQ Standby is up
+    """
+    # Mock calls
+    is_component_running_mock.return_value = True
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'standby'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_OK)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Standby is running')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_standby_critical(self, is_component_running_mock):
+    """
+    Test that the status is CRITICAL when HAWQ Standby is down
+    """
+    # Mock calls
+    is_component_running_mock.return_value = False
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'standby'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Standby is not running')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_segment_ok(self, is_component_running_mock):
+    """
+    Test that the status is OK when HAWQ Segment is up
+    """
+    # Mock calls
+    is_component_running_mock.return_value = True
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'segment'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_OK)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Segment is running')
+
+  @patch("alert_component_status.is_component_running")
+  def test_hawq_segment_critical(self, is_component_running_mock):
+    """
+    Test that the status is CRITICAL when HAWQ Segment is down
+    """
+    # Mock calls
+    is_component_running_mock.return_value = False
+
+    [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'segment'})
+    self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL)
+    self.assertTrue(messages is not None and len(messages) == 1)
+    self.assertEqual(messages[0], 'HAWQ Segment is not running')

http://git-wip-us.apache.org/repos/asf/ambari/blob/29115e81/ambari-web/app/views/main/dashboard/widgets.js
----------------------------------------------------------------------
diff --git a/ambari-web/app/views/main/dashboard/widgets.js b/ambari-web/app/views/main/dashboard/widgets.js
index c6723be..04be7bb 100644
--- a/ambari-web/app/views/main/dashboard/widgets.js
+++ b/ambari-web/app/views/main/dashboard/widgets.js
@@ -478,7 +478,7 @@ App.MainDashboardWidgetsView = Em.View.extend(App.UserPref, App.LocalStorage, Ap
     visible: [],
     hidden: [],
     threshold: {1: [80, 90], 2: [85, 95], 3: [90, 95], 4: [80, 90], 5: [1000, 3000], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [70, 90], 14: [150, 250], 15: [3, 10], 16: [],
-      17: [70, 90], 18: [], 19: [50, 75], 20: [50, 75], 21: [85, 95], 22: [85, 95], 23: [], 24: [80, 90]} // id:[thresh1, thresh2]
+      17: [70, 90], 18: [], 19: [50, 75], 20: [50, 75], 21: [85, 95], 22: [85, 95], 23: [], 24: [70, 90]} // id:[thresh1, thresh2]
   }),
 
   /**