You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2014/09/18 16:44:13 UTC
[24/34] git commit: AMBARI-7284 - (Apache AMBARI-7284) Hadoop cluster
alerts need updates for Hadoop 2.4 and 2.5
AMBARI-7284 - (Apache AMBARI-7284) Hadoop cluster alerts need updates for Hadoop 2.4 and 2.5
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/a14ca238
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/a14ca238
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/a14ca238
Branch: refs/heads/branch-alerts-dev
Commit: a14ca23882a76c1daa27038c8e00a20d231ce55f
Parents: 3f932cf
Author: Artem Baranchuk <ab...@hortonworks.com>
Authored: Mon Sep 15 19:50:29 2014 +0300
Committer: Artem Baranchuk <ab...@hortonworks.com>
Committed: Thu Sep 18 13:47:48 2014 +0300
----------------------------------------------------------------------
.../services/NAGIOS/package/files/sys_logger.py | 30 +++++---
.../test/nagios/plugins/test_sys_logger.py | 77 ++++++++++++++++++--
2 files changed, 91 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/a14ca238/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
index 8f0a415..e86a8fb 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
@@ -114,14 +114,24 @@ msg_ids = {'Host::Ping':'host_down',
'GANGLIA::Ganglia Monitor process for ResourceManager':'ganglia_monitor_process',
'GANGLIA::Ganglia Monitor process for HistoryServer':'ganglia_monitor_process',
'HBASEMASTER::HBase Master process':'hbase_master_process',
+ 'HBASE::Percent RegionServers live':'regionservers_down',
'REGIONSERVER::RegionServer process':'regionserver_process',
'NAGIOS::Nagios status log freshness':'nagios_process',
'FLUME::Flume Agent process':'flume_agent_process',
'OOZIE::Oozie Server status':'oozie_server_process',
'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process',
- 'WEBHCAT::WebHCat Server status':'webhcat_server_process',
- 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process',
- 'NODEMANAGER::NodeManager process':'nodemanager_process',
+ 'WEBHCAT::WebHCat Server status':'webhcat_down',
+ 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down',
+ 'RESOURCEMANAGER::ResourceManager RPC latency':'resourcemanager_rpc_latency',
+ 'RESOURCEMANAGER::ResourceManager CPU utilization':'resourcemanager_cpu_utilization',
+ 'RESOURCEMANAGER::ResourceManager Web UI':'recourcemanager_ui',
+ 'NODEMANAGER::NodeManager process':'nodemanager_process_down',
+ 'NODEMANAGER::NodeManager health':'nodemanager_health',
+ 'NODEMANAGER::Percent NodeManagers live':'nodemanagers_down',
+ 'APP_TIMELINE_SERVER::App Timeline Server process':'timelineserver_process',
+ 'JOBHISTORY::HistoryServer RPC latency':'historyserver_rpc_latency',
+ 'JOBHISTORY::HistoryServer CPU utilization':'historyserver_cpu_utilization',
+ 'JOBHISTORY::HistoryServer Web UI':'historyserver_ui',
'JOBHISTORY::HistoryServer process':'historyserver_process'}
# Determine the severity of the TVI alert based on the Nagios alert state.
@@ -142,13 +152,13 @@ def determine_severity(state, service):
# Determine the msg id for the TVI alert based on the service which generates the Nagios alert.
# The msg id is used to correlate a log msg to a TVI rule.
def determine_msg_id(service, severity):
- if msg_ids.has_key(service):
- msg_id = msg_ids[service]
- if severity == 'OK':
- msg_id = '{0}_ok'.format(msg_id)
-
- return msg_id
- else: return 'HADOOP_UNKNOWN_MSG'
+ for k, v in msg_ids.iteritems():
+ if(k in service):
+ msg_id = v
+ if severity == 'OK':
+ msg_id = '{0}_ok'.format(msg_id)
+ return msg_id
+ return 'HADOOP_UNKNOWN_MSG'
# Determine the domain. Currently the domain is always 'Hadoop'.
http://git-wip-us.apache.org/repos/asf/ambari/blob/a14ca238/contrib/addons/test/nagios/plugins/test_sys_logger.py
----------------------------------------------------------------------
diff --git a/contrib/addons/test/nagios/plugins/test_sys_logger.py b/contrib/addons/test/nagios/plugins/test_sys_logger.py
index eb7a8fe..49c5de8 100644
--- a/contrib/addons/test/nagios/plugins/test_sys_logger.py
+++ b/contrib/addons/test/nagios/plugins/test_sys_logger.py
@@ -259,6 +259,13 @@ test('Hadoop_RegionServer_Down:OK',
'OK: Hadoop: regionservers_down_ok# SERVICE MSG',
'HARD', '1', 'OK', 'HBASE::Percent region servers down', 'SERVICE MSG')
+test('HBASE_RegionServer_live',
+ 'Critical: Hadoop: regionservers_down# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'HBASE::Percent RegionServers live', 'SERVICE MSG')
+test('HBASE_RegionServer_live:OK',
+ 'OK: Hadoop: regionservers_down_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'HBASE::Percent RegionServers live', 'SERVICE MSG')
+
# Hadoop_Hive_Metastore_Process_Down
test('Hadoop_Hive_Metastore_Process_Down',
'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG',
@@ -548,26 +555,48 @@ test('Hive_Metastore_status:OK',
'HARD', '1', 'OK', 'HIVE-METASTORE::Hive Metastore status', 'SERVICE MSG')
test('WebHCat_Server_status',
- 'Critical: Hadoop: webhcat_server_process# SERVICE MSG',
+ 'Critical: Hadoop: webhcat_down# SERVICE MSG',
'HARD', '1', 'CRITICAL', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG')
test('WebHCat_Server_status:OK',
- 'OK: Hadoop: webhcat_server_process_ok# SERVICE MSG',
+ 'OK: Hadoop: webhcat_down_ok# SERVICE MSG',
'HARD', '1', 'OK', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG')
test('ResourceManager_process',
- 'Critical: Hadoop: resourcemanager_process# SERVICE MSG',
+ 'Critical: Hadoop: resourcemanager_process_down# SERVICE MSG',
'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG')
test('ResourceManager_process:OK',
- 'OK: Hadoop: resourcemanager_process_ok# SERVICE MSG',
+ 'OK: Hadoop: resourcemanager_process_down_ok# SERVICE MSG',
'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG')
+test('AppTimeline_process',
+ 'Critical: Hadoop: timelineserver_process# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG')
+test('AppTimeline_process:OK',
+ 'OK: Hadoop: timelineserver_process_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG')
+
test('NodeManager_process',
- 'Critical: Hadoop: nodemanager_process# SERVICE MSG',
+ 'Critical: Hadoop: nodemanager_process_down# SERVICE MSG',
'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager process', 'SERVICE MSG')
test('NodeManager_process:OK',
- 'OK: Hadoop: nodemanager_process_ok# SERVICE MSG',
+ 'OK: Hadoop: nodemanager_process_down_ok# SERVICE MSG',
'HARD', '1', 'OK', 'NODEMANAGER::NodeManager process', 'SERVICE MSG')
+test('NodeManager_health',
+ 'Critical: Hadoop: nodemanager_health# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager health', 'SERVICE MSG')
+test('NodeManager_health:OK',
+ 'OK: Hadoop: nodemanager_health_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'NODEMANAGER::NodeManager health', 'SERVICE MSG')
+
+test('NodeManager_live',
+ 'Critical: Hadoop: nodemanagers_down# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG')
+test('NodeManager_live:OK',
+ 'OK: Hadoop: nodemanagers_down_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG')
+
+
test('HistoryServer_process',
'Critical: Hadoop: historyserver_process# SERVICE MSG',
'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG')
@@ -575,5 +604,41 @@ test('HistoryServer_process:OK',
'OK: Hadoop: historyserver_process_ok# SERVICE MSG',
'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG')
+test('HistoryServer_RPC_latency',
+ 'Critical: Hadoop: historyserver_rpc_latency# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG')
+test('HistoryServer_RPC_latency:OK',
+ 'OK: Hadoop: historyserver_rpc_latency_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG')
+
+test('HistoryServer_CPU_utilization',
+ 'Critical: Hadoop: historyserver_cpu_utilization# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG')
+test('HistoryServer_CPU_utilization:OK',
+ 'OK: Hadoop: historyserver_cpu_utilization_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG')
+
+test('HistoryServer_Web_UI',
+ 'Critical: Hadoop: historyserver_ui# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG')
+test('HistoryServer_Web_UI:OK',
+ 'OK: Hadoop: historyserver_ui_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG')
+
+test('ResourceManager_rpc_latency',
+ 'Critical: Hadoop: resourcemanager_rpc_latency# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG')
+test('ResourceManager_rpc_latency:OK',
+ 'OK: Hadoop: resourcemanager_rpc_latency_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG')
+
+test('ResourceManager_cpu_utilization',
+ 'Critical: Hadoop: resourcemanager_cpu_utilization# SERVICE MSG',
+ 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG')
+test('ResourceManager_cpu_utilization:OK',
+ 'OK: Hadoop: resourcemanager_cpu_utilization_ok# SERVICE MSG',
+ 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG')
+
+
summary()