You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ra...@apache.org on 2012/05/17 22:11:03 UTC
svn commit: r1339838 - in /incubator/ambari/branches/ambari-186: CHANGES.txt
hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb
hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
Author: ramya
Date: Thu May 17 20:11:02 2012
New Revision: 1339838
URL: http://svn.apache.org/viewvc?rev=1339838&view=rev
Log:
AMBARI-270. Specify the notification intervals and options for Alerts. Contributed by Suhas
Modified:
incubator/ambari/branches/ambari-186/CHANGES.txt
incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb
incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
Modified: incubator/ambari/branches/ambari-186/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/ambari/branches/ambari-186/CHANGES.txt?rev=1339838&r1=1339837&r2=1339838&view=diff
==============================================================================
--- incubator/ambari/branches/ambari-186/CHANGES.txt (original)
+++ incubator/ambari/branches/ambari-186/CHANGES.txt Thu May 17 20:11:02 2012
@@ -2,6 +2,9 @@ Ambari Change log
Release 0.x.x - unreleased
+ AMBARI-270. Specify the notification intervals and options for Alerts
+ (vgogate via ramya)
+
AMBARI-300. Change the status message (success/error) location so that it
shows below the page summary box, rather than above, more better visibility
(Yusaku Sako via ramya)
Modified: incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb
URL: http://svn.apache.org/viewvc/incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb?rev=1339838&r1=1339837&r2=1339838&view=diff
==============================================================================
--- incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb (original)
+++ incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-hosts.cfg.erb Thu May 17 20:11:02 2012
@@ -7,6 +7,10 @@ define host {
check_interval 0.25
retry_interval 0.25
max_check_attempts 4
+ notifications_enabled 1
+ first_notification_delay 0 # Send notification soon after change in the hard state
+ notification_interval 0 # Send the notification once
+ notification_options d,u,r
}
<%end%>
Modified: incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
URL: http://svn.apache.org/viewvc/incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb?rev=1339838&r1=1339837&r2=1339838&view=diff
==============================================================================
--- incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb (original)
+++ incubator/ambari/branches/ambari-186/hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb Thu May 17 20:11:02 2012
@@ -1,8 +1,16 @@
# NAGIOS SERVER Check (status log update)
<%if scope.function_hdp_nagios_members_exist('nagios-server')-%>
+define service {
+ name hadoop-service
+ use generic-service
+ notification_options w,u,c
+ first_notification_delay 0
+ notification_interval 0 # Send the notification once
+}
+
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description NAGIOS::Nagios status log staleness
check_command check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
normal_check_interval 5
@@ -13,7 +21,7 @@ define service {
# NAGIOS SERVER HDFS Checks
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description HDFS::Percent DataNodes storage full
check_command check_aggregate!"DATANODE::Storage full"!10%!30%
normal_check_interval 2
@@ -23,7 +31,7 @@ define service {
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description HDFS::Percent DataNodes down
check_command check_aggregate!"DATANODE::Process down"!10%!30%
normal_check_interval 0.5
@@ -34,7 +42,7 @@ define service {
# NAGIOS SERVER MAPREDUCE Checks
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description MAPREDUCE::Percent TaskTrackers down
check_command check_aggregate!"TASKTRACKER::Process down"!10%!30%
normal_check_interval 0.5
@@ -46,7 +54,7 @@ define service {
<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description ZOOKEEPER::Percent zookeeper servers down
check_command check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
normal_check_interval 0.5
@@ -59,7 +67,7 @@ define service {
<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
define service {
hostgroup_name nagios-server
- use generic-service
+ use hadoop-service
service_description HBASE::Percent region servers down
check_command check_aggregate!"REGIONSERVER::Process down"!10%!30%
normal_check_interval 0.5
@@ -73,7 +81,7 @@ define service {
<%if scope.function_hdp_nagios_members_exist('ganglia-server')-%>
define service {
hostgroup_name ganglia-server
- use generic-service
+ use hadoop-service
service_description GANGLIA::Ganglia [gmetad] Process down
check_command check_tcp!8651!-w 1 -c 1
normal_check_interval 0.25
@@ -83,7 +91,7 @@ define service {
define service {
hostgroup_name ganglia-server
- use generic-service
+ use hadoop-service
service_description GANGLIA::Ganglia collector [gmond] Process down alert for slaves
check_command check_tcp!8660!-w 1 -c 1
normal_check_interval 0.25
@@ -93,7 +101,7 @@ define service {
define service {
hostgroup_name ganglia-server
- use generic-service
+ use hadoop-service
service_description GANGLIA::Ganglia collector [gmond] Process down alert for namenode
check_command check_tcp!8661!-w 1 -c 1
normal_check_interval 0.25
@@ -103,7 +111,7 @@ define service {
define service {
hostgroup_name ganglia-server
- use generic-service
+ use hadoop-service
service_description GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker
check_command check_tcp!8662!-w 1 -c 1
normal_check_interval 0.25
@@ -113,7 +121,7 @@ define service {
define service {
hostgroup_name ganglia-server
- use generic-service
+ use hadoop-service
service_description GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster
check_command check_tcp!8663!-w 1 -c 1
normal_check_interval 0.25
@@ -126,7 +134,7 @@ define service {
# Secondary namenode checks
define service {
hostgroup_name snamenode
- use generic-service
+ use hadoop-service
service_description NAMENODE::Secondary Namenode Process down
check_command check_tcp!50090!-w 1 -c 1
normal_check_interval 0.5
@@ -138,7 +146,7 @@ define service {
# HDFS Checks
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description NAMENODE::Namenode Web UI down
check_command check_webui!namenode
normal_check_interval 1
@@ -148,7 +156,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description NAMENODE::Namenode Edit logs directory status
check_command check_name_dir_status!50070
normal_check_interval 0.5
@@ -158,7 +166,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description NAMENODE::Namenode Host CPU utilization
check_command check_cpu!200%!250%
normal_check_interval 5
@@ -168,7 +176,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description NAMENODE::Namenode Process down
check_command check_tcp!8020!-w 1 -c 1
normal_check_interval 0.5
@@ -178,7 +186,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description HDFS::Corrupt/Missing blocks
check_command check_hdfs_blocks!50070!0%!0%
normal_check_interval 2
@@ -188,7 +196,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description HDFS::HDFS Capacity utilization
check_command check_hdfs_capacity!50070!80%!90%
normal_check_interval 10
@@ -198,7 +206,7 @@ define service {
define service {
hostgroup_name namenode
- use generic-service
+ use hadoop-service
service_description HDFS::Namenode RPC Latency
check_command check_rpcq_latency!NameNode!50070!3000!5000
normal_check_interval 5
@@ -211,7 +219,7 @@ define service {
<%if scope.function_hdp_nagios_members_exist('jobtracker')-%>
define service {
hostgroup_name jobtracker
- use generic-service
+ use hadoop-service
service_description JOBTRACKER::JobTracker Web UI down
check_command check_webui!jobtracker
normal_check_interval 1
@@ -221,7 +229,7 @@ define service {
define service {
hostgroup_name jobtracker
- use generic-service
+ use hadoop-service
service_description JOBTRACKER::JobHistory Web UI down
check_command check_webui!jobhistory
normal_check_interval 1
@@ -231,7 +239,7 @@ define service {
define service {
hostgroup_name jobtracker
- use generic-service
+ use hadoop-service
service_description JOBTRACKER::Jobtracker CPU utilization
check_command check_cpu!200%!250%
normal_check_interval 5
@@ -241,7 +249,7 @@ define service {
define service {
hostgroup_name jobtracker
- use generic-service
+ use hadoop-service
service_description JOBTRACKER::Jobtracker Process down
check_command check_tcp!50030!-w 1 -c 1
normal_check_interval 0.5
@@ -251,7 +259,7 @@ define service {
define service {
hostgroup_name jobtracker
- use generic-service
+ use hadoop-service
service_description MAPREDUCE::JobTracker RPC Latency
check_command check_rpcq_latency!JobTracker!50030!3000!5000
normal_check_interval 5
@@ -264,33 +272,36 @@ define service {
<%if scope.function_hdp_nagios_members_exist('slaves')-%>
define service {
hostgroup_name slaves
- use generic-service
+ use hadoop-service
service_description DATANODE::Process down
check_command check_tcp!50010!-w 1 -c 1
normal_check_interval 1
retry_check_interval 0.5
max_check_attempts 3
+ notifications_enabled 0
}
define service {
hostgroup_name slaves
- use generic-service
+ use hadoop-service
service_description DATANODE::Storage full
check_command check_datanode_storage!50075!90%!90%
normal_check_interval 5
retry_check_interval 1
max_check_attempts 2
+ notifications_enabled 0
}
# MAPREDUCE::TASKTRACKER Checks
define service {
hostgroup_name slaves
- use generic-service
+ use hadoop-service
service_description TASKTRACKER::Process down
check_command check_tcp!50060!-w 1 -c 1
normal_check_interval 1
retry_check_interval 0.5
max_check_attempts 3
+ notifications_enabled 0
}
<%end-%>
@@ -298,12 +309,13 @@ define service {
# HBASE::REGIONSERVER Checks
define service {
hostgroup_name region-servers
- use generic-service
+ use hadoop-service
service_description REGIONSERVER::Process down
check_command check_tcp!60020!-w 1 -c 1
normal_check_interval 1
retry_check_interval 0.5
max_check_attempts 3
+ notifications_enabled 0
}
<%end-%>
@@ -311,7 +323,7 @@ define service {
# ZOOKEEPER Checks
define service {
hostgroup_name zookeeper-servers
- use generic-service
+ use hadoop-service
service_description ZKSERVERS::ZKSERVERS Process down
check_command check_tcp!2181!-w 1 -c 1
normal_check_interval 1
@@ -324,7 +336,7 @@ define service {
# HBASE Checks
define service {
hostgroup_name hbasemaster
- use generic-service
+ use hadoop-service
service_description HBASEMASTER::HBase Web UI down
check_command check_webui!hbase
normal_check_interval 1
@@ -334,7 +346,7 @@ define service {
define service {
hostgroup_name hbasemaster
- use generic-service
+ use hadoop-service
service_description HBASEMASTER::HBaseMaster CPU utilization
check_command check_cpu!200%!250%
normal_check_interval 5
@@ -344,7 +356,7 @@ define service {
define service {
hostgroup_name hbasemaster
- use generic-service
+ use hadoop-service
service_description HBASEMASTER::HBaseMaster Process down
check_command check_tcp!60000!-w 1 -c 1
normal_check_interval 0.5
@@ -357,7 +369,7 @@ define service {
# HIVE Metastore check
define service {
hostgroup_name hiveserver
- use generic-service
+ use hadoop-service
service_description HIVE-METASTORE::HIVE-METASTORE status check
check_command check_hive_metastore_status!9083
normal_check_interval 0.5
@@ -369,7 +381,7 @@ define service {
# Oozie check
define service {
hostgroup_name oozie-server
- use generic-service
+ use hadoop-service
service_description OOZIE::Oozie status check
check_command check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
normal_check_interval 1
@@ -381,7 +393,7 @@ define service {
# Templeton check
define service {
hostgroup_name templeton-server
- use generic-service
+ use hadoop-service
service_description TEMPLETON::Templeton status check
check_command check_templeton_status!50111!v1
normal_check_interval 1