You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by yu...@apache.org on 2013/08/16 00:22:19 UTC
git commit: AMBARI-2920. Rename alert titles and descriptions. (yusaku)
Updated Branches:
refs/heads/trunk d44c1c2ba -> a2b675c6a
AMBARI-2920. Rename alert titles and descriptions. (yusaku)
Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/a2b675c6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/a2b675c6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/a2b675c6
Branch: refs/heads/trunk
Commit: a2b675c6ada585a830ad928f6aefe426acb9dc03
Parents: d44c1c2
Author: Yusaku Sako <yu...@hortonworks.com>
Authored: Thu Aug 15 14:59:48 2013 -0700
Committer: Yusaku Sako <yu...@hortonworks.com>
Committed: Thu Aug 15 15:22:02 2013 -0700
----------------------------------------------------------------------
.../files/check_ambari_agent_status.sh | 4 +-
.../files/check_hive_metastore_status.sh | 4 +-
.../files/check_mapred_local_dir_used.sh | 4 +-
.../hdp-nagios/files/check_name_dir_status.php | 6 +-
.../files/check_nodemanager_health.sh | 4 +-
.../hdp-nagios/files/check_oozie_status.sh | 4 +-
.../hdp-nagios/files/check_templeton_status.sh | 4 +-
.../modules/hdp-nagios/files/check_webui.sh | 10 +-
.../templates/hadoop-services.cfg.erb | 96 ++++++++++----------
9 files changed, 68 insertions(+), 68 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh
index dd67496..a8b510a 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh
@@ -31,9 +31,9 @@ fi
if [ $RES -eq "2" ]
then
- echo "OK: Ambari agent is running [PID:$AMBARI_AGENT_PID]";
+ echo "OK: Ambari Agent is running [PID:$AMBARI_AGENT_PID]";
exit 0;
else
- echo "CRITICAL: Ambari agent is not running [$AMBARI_AGENT_PID_PATH not found]";
+ echo "CRITICAL: Ambari Agent is not running [$AMBARI_AGENT_PID_PATH not found]";
exit 2;
fi
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh
index 0140958..640c077 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh
@@ -38,8 +38,8 @@ HCAT_URL=-Dhive.metastore.uris="thrift://$HOST:$PORT"
export JAVA_HOME=$JAVA_HOME
out=`hcat $HCAT_URL -e "show databases" 2>&1`
if [[ "$?" -ne 0 ]]; then
- echo "CRITICAL: Error accessing hive-metaserver status [$out]";
+ echo "CRITICAL: Error accessing Hive Metastore status [$out]";
exit 2;
fi
-echo "OK: Hive metaserver status OK";
+echo "OK: Hive Metastore status OK";
exit 0;
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh
index e91cb66..15c85eb 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh
@@ -26,9 +26,9 @@ for mapred_dir in $MAPRED_LOCAL_DIRS
do
percent=`df -hl $mapred_dir | awk '{percent=$5;} END{print percent}' | cut -d % -f 1`
if [ $percent -ge $CRITICAL ]; then
- echo "CRITICAL: Mapreduce local dir is full."
+ echo "CRITICAL: MapReduce local dir is full."
exit 2
fi
done
-echo "OK: Mapreduce local dir space is available."
+echo "OK: MapReduce local dir space is available."
exit 0
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php
index 3f38c98..db2b491 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php
@@ -36,12 +36,12 @@
$json_array = json_decode($json_string, true);
$object = $json_array['beans'][0];
if ($object['NameDirStatuses'] == "") {
- echo "WARNING: Namenode directory status not available via http://".$host.":".$port."/jmx url" . "\n";
+ echo "WARNING: NameNode directory status not available via http://".$host.":".$port."/jmx url" . "\n";
exit(1);
}
$NameDirStatuses = json_decode($object['NameDirStatuses'], true);
$failed_dir_count = count($NameDirStatuses['failed']);
- $out_msg = "CRITICAL: Offline Namenode directories: ";
+ $out_msg = "CRITICAL: Offline NameNode directories: ";
if ($failed_dir_count > 0) {
foreach ($NameDirStatuses['failed'] as $key => $value) {
$out_msg = $out_msg . $key . ":" . $value . ", ";
@@ -49,7 +49,7 @@
echo $out_msg . "\n";
exit (2);
}
- echo "OK: All Namenode directories are active" . "\n";
+ echo "OK: All NameNode directories are active" . "\n";
exit(0);
/* print usage */
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
index 2a26f4e..82b8a3d 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
@@ -25,8 +25,8 @@ NODEMANAGER_URL="http://$HOST:$PORT/ws/v1/node/info"
export PATH="/usr/bin:$PATH"
RESPONSE=`curl -s $NODEMANAGER_URL`
if [[ "$RESPONSE" == *'"nodeHealthy":true'* ]]; then
- echo "OK: nodemanager healthy true";
+ echo "OK: NodeManager healthy";
exit 0;
fi
-echo "CRITICAL: nodemanager healthy false";
+echo "CRITICAL: NodeManager unhealthy";
exit 2;
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh
index bfd9d75..820ee99 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh
@@ -38,8 +38,8 @@ OOZIE_URL="http://$HOST:$PORT/oozie"
export JAVA_HOME=$JAVA_HOME
out=`oozie admin -oozie ${OOZIE_URL} -status 2>&1`
if [[ "$?" -ne 0 ]]; then
- echo "CRITICAL: Error accessing oozie server status [$out]";
+ echo "CRITICAL: Error accessing Oozie Server status [$out]";
exit 2;
fi
-echo "OK: Oozie server status [$out]";
+echo "OK: Oozie Server status [$out]";
exit 0;
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh
index 7190956..79424be 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh
@@ -37,8 +37,8 @@ fi
regex="^.*\"status\":\"ok\".*<status_code:200>$"
out=`curl --negotiate -u : -s -w '<status_code:%{http_code}>' http://$HOST:$PORT/templeton/$VERSION/status 2>&1`
if [[ $out =~ $regex ]]; then
- echo "OK: Templeton server status [$out]";
+ echo "OK: WebHCat Server status [$out]";
exit 0;
fi
-echo "CRITICAL: Error accessing Templeton server, status [$out]";
+echo "CRITICAL: Error accessing WebHCat Server, status [$out]";
exit 2;
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
index 57381e4..b23045e 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
@@ -46,35 +46,35 @@ jobtracker)
namenode)
nnweburl="http://$host:$port"
if [[ `checkurl "$nnweburl"` -ne 0 ]] ; then
- echo "WARNING: NameNode web UI not accessible : $nnweburl";
+ echo "WARNING: NameNode Web UI not accessible : $nnweburl";
exit 1;
fi
;;
jobhistory)
jhweburl="http://$host:$port/jobhistoryhome.jsp"
if [[ `checkurl "$jhweburl"` -ne 0 ]]; then
- echo "WARNING: Jobhistory web UI not accessible : $jhweburl";
+ echo "WARNING: HistoryServer Web UI not accessible : $jhweburl";
exit 1;
fi
;;
hbase)
hbaseweburl="http://$host:$port/master-status"
if [[ `checkurl "$hbaseweburl"` -ne 0 ]]; then
- echo "WARNING: HBase Master web UI not accessible : $hbaseweburl";
+ echo "WARNING: HBase Master Web UI not accessible : $hbaseweburl";
exit 1;
fi
;;
resourcemanager)
rmweburl="http://$host:$port/cluster"
if [[ `checkurl "$rmweburl"` -ne 0 ]]; then
- echo "WARNING: ResourceManager web UI not accessible : $rmweburl";
+ echo "WARNING: ResourceManager Web UI not accessible : $rmweburl";
exit 1;
fi
;;
historyserver2)
hsweburl="http://$host:$port/jobhistory"
if [[ `checkurl "$hsweburl"` -ne 0 ]]; then
- echo "WARNING: HistoryServer2 web UI not accessible : $hsweburl";
+ echo "WARNING: HistoryServer Web UI not accessible : $hsweburl";
exit 1;
fi
;;
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
index d35b982..3256eb4 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
@@ -33,7 +33,7 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description NAGIOS::Nagios status log staleness
+ service_description NAGIOS::Nagios status log freshness
servicegroups NAGIOS
check_command check_nagios!10!/var/nagios/status.dat!<%=scope.function_hdp_template_var("::hdp-nagios::server::config::nagios_lookup_daemon_str")%>
normal_check_interval 5
@@ -45,9 +45,9 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description HDFS::Percent DataNodes storage full
+ service_description HDFS::Percent DataNodes with space available
servicegroups HDFS
- check_command check_aggregate!"DATANODE::DataNode storage full"!10%!30%
+ check_command check_aggregate!"DATANODE::DataNode space"!10%!30%
normal_check_interval 2
retry_check_interval 1
max_check_attempts 1
@@ -56,9 +56,9 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description HDFS::Percent DataNodes down
+ service_description HDFS::Percent DataNodes live
servicegroups HDFS
- check_command check_aggregate!"DATANODE::DataNode process down"!10%!30%
+ check_command check_aggregate!"DATANODE::DataNode process"!10%!30%
normal_check_interval 0.5
retry_check_interval 0.25
max_check_attempts 3
@@ -69,7 +69,7 @@ define service {
define service {
hostgroup_name agent-servers
use hadoop-service
- service_description AMBARI::Check ambari-agent process
+ service_description AMBARI::Ambari Agent process
servicegroups AMBARI
check_command check_ambari_agent_status
normal_check_interval 5
@@ -82,9 +82,9 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description ZOOKEEPER::Percent ZooKeeper Servers down
+ service_description ZOOKEEPER::Percent ZooKeeper Servers live
servicegroups ZOOKEEPER
- check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process down"!35%!70%
+ check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process"!35%!70%
normal_check_interval 0.5
retry_check_interval 0.25
max_check_attempts 3
@@ -96,9 +96,9 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description HBASE::Percent RegionServers down
+ service_description HBASE::Percent RegionServers live
servicegroups HBASE
- check_command check_aggregate!"REGIONSERVER::RegionServer process down"!10%!30%
+ check_command check_aggregate!"REGIONSERVER::RegionServer process"!10%!30%
normal_check_interval 0.5
retry_check_interval 0.25
max_check_attempts 3
@@ -113,7 +113,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia [gmetad] process down
+ service_description GANGLIA::Ganglia Server process
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -124,7 +124,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for slaves
+ service_description GANGLIA::Ganglia Monitor process for Slaves
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_slaves_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -135,7 +135,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for NameNode
+ service_description GANGLIA::Ganglia Monitor process for NameNode
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_namenode_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -147,7 +147,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker
+ service_description GANGLIA::Ganglia Monitor process for JobTracker
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_jobtracker_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -160,7 +160,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master
+ service_description GANGLIA::Ganglia Monitor process for HBase Master
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hbase_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -173,7 +173,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for ResourceManager
+ service_description GANGLIA::Ganglia Monitor process for ResourceManager
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_rm_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -186,7 +186,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for NodeManager
+ service_description GANGLIA::Ganglia Monitor process for NodeManager
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_nm_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -199,7 +199,7 @@ define service {
define service {
hostgroup_name ganglia-server
use hadoop-service
- service_description GANGLIA::Ganglia Collector [gmond] process down alert for History Server 2
+ service_description GANGLIA::Ganglia Monitor process for HistoryServer
servicegroups GANGLIA
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hs_port")%>!-w 1 -c 1
normal_check_interval 0.25
@@ -215,7 +215,7 @@ define service {
define service {
hostgroup_name snamenode
use hadoop-service
- service_description NAMENODE::Secondary NameNode process down
+ service_description NAMENODE::Secondary NameNode process
servicegroups HDFS
check_command check_tcp!<%=scope.function_hdp_template_var("snamenode_port")%>!-w 1 -c 1
normal_check_interval 0.5
@@ -228,7 +228,7 @@ define service {
define service {
hostgroup_name namenode
use hadoop-service
- service_description NAMENODE::NameNode Web UI down
+ service_description NAMENODE::NameNode Web UI
servicegroups HDFS
check_command check_webui!namenode!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>
normal_check_interval 1
@@ -262,7 +262,7 @@ define service {
define service {
hostgroup_name namenode
use hadoop-service
- service_description NAMENODE::NameNode process down
+ service_description NAMENODE::NameNode process
servicegroups HDFS
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::namenode_metadata_port")%>!-w 1 -c 1
normal_check_interval 0.5
@@ -273,7 +273,7 @@ define service {
define service {
hostgroup_name namenode
use hadoop-service
- service_description HDFS::Corrupt/Missing blocks
+ service_description HDFS::Blocks health
servicegroups HDFS
check_command check_hdfs_blocks!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!0%!0%
normal_check_interval 2
@@ -309,7 +309,7 @@ define service {
define service {
hostgroup_name jobtracker
use hadoop-service
- service_description JOBTRACKER::JobTracker Web UI down
+ service_description JOBTRACKER::JobTracker Web UI
servicegroups MAPREDUCE
check_command check_webui!jobtracker!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%>
normal_check_interval 1
@@ -320,7 +320,7 @@ define service {
define service {
hostgroup_name jobtracker
use hadoop-service
- service_description JOBTRACKER::JobHistory Web UI down
+ service_description JOBTRACKER::HistoryServer Web UI
servicegroups MAPREDUCE
check_command check_webui!jobhistory!<%=scope.function_hdp_template_var("::hdp::jobhistory_port")%>
normal_check_interval 1
@@ -344,7 +344,7 @@ define service {
hostgroup_name jobtracker
use hadoop-service
use hadoop-service
- service_description JOBTRACKER::JobTracker process down
+ service_description JOBTRACKER::JobTracker process
servicegroups MAPREDUCE
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%>!-w 1 -c 1
normal_check_interval 0.5
@@ -369,9 +369,9 @@ define service {
define service {
hostgroup_name nagios-server
use hadoop-service
- service_description MAPREDUCE::Percent TaskTrackers down
+ service_description MAPREDUCE::Percent TaskTrackers live
servicegroups MAPREDUCE
- check_command check_aggregate!"TASKTRACKER::TaskTracker process down"!10%!30%
+ check_command check_aggregate!"TASKTRACKER::TaskTracker process"!10%!30%
normal_check_interval 0.5
retry_check_interval 0.25
max_check_attempts 3
@@ -381,7 +381,7 @@ define service {
define service {
hostgroup_name tasktracker-servers
use hadoop-service
- service_description TASKTRACKER::TaskTracker process down
+ service_description TASKTRACKER::TaskTracker process
servicegroups MAPREDUCE
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::tasktracker_port")%>!-w 1 -c 1
normal_check_interval 1
@@ -393,7 +393,7 @@ define service {
define service {
hostgroup_name tasktracker-servers
use hadoop-service
- service_description TASKTRACKER::Mapreduce local dir used space
+ service_description TASKTRACKER::MapReduce local dir space
servicegroups MAPREDUCE
check_command check_mapred_local_dir_used_space!<%=scope.function_hdp_default("::hdp::mapred-site/mapred.local.dir")%>!85%
normal_check_interval 0.5
@@ -409,7 +409,7 @@ define service {
define service {
hostgroup_name resourcemanager
use hadoop-service
- service_description RESOURCEMANAGER::ResourceManager Web UI down
+ service_description RESOURCEMANAGER::ResourceManager Web UI
servicegroups YARN
check_command check_webui!resourcemanager!<%=scope.function_hdp_template_var("::hdp::rm_port")%>
normal_check_interval 1
@@ -442,7 +442,7 @@ define service {
define service {
hostgroup_name resourcemanager
use hadoop-service
- service_description RESOURCEMANAGER::Percent NodeManager down
+ service_description RESOURCEMANAGER::Percent NodeManager live
servicegroups YARN
check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30
normal_check_interval 1
@@ -453,7 +453,7 @@ define service {
define service {
hostgroup_name resourcemanager
use hadoop-service
- service_description RESOURCEMANAGER::Percent NodeManager unhealthy
+ service_description RESOURCEMANAGER::Percent NodeManager healthy
servicegroups YARN
check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30
normal_check_interval 1
@@ -467,7 +467,7 @@ define service {
define service {
hostgroup_name nodemanagers
use hadoop-service
- service_description NODEMANAGER::NodeManager process down
+ service_description NODEMANAGER::NodeManager process
servicegroups YARN
check_command check_tcp!<%=scope.function_hdp_template_var("nm_port")%>!-w 1 -c 1
normal_check_interval 1
@@ -478,7 +478,7 @@ define service {
define service {
hostgroup_name nodemanagers
use hadoop-service
- service_description NODEMANAGER::NodeManager unhealthy
+ service_description NODEMANAGER::NodeManager health
servicegroups YARN
check_command check_nodemanager_health!<%=scope.function_hdp_template_var("nm_port")%>
normal_check_interval 1
@@ -492,7 +492,7 @@ define service {
define service {
hostgroup_name historyserver2
use hadoop-service
- service_description JOBHISTORY::History Server 2 Web UI down
+ service_description JOBHISTORY::HistoryServer Web UI
servicegroups MAPREDUCE
check_command check_webui!historyserver2!<%=scope.function_hdp_template_var("::hdp::hs_port")%>
normal_check_interval 1
@@ -503,7 +503,7 @@ define service {
define service {
hostgroup_name historyserver2
use hadoop-service
- service_description JOBHISTORY::History Server 2 CPU utilization
+ service_description JOBHISTORY::HistoryServer CPU utilization
servicegroups MAPREDUCE
check_command check_cpu!200%!250%
normal_check_interval 5
@@ -514,7 +514,7 @@ define service {
define service {
hostgroup_name historyserver2
use hadoop-service
- service_description JOBHISTORY::History Server 2 RPC latency
+ service_description JOBHISTORY::HistoryServer RPC latency
servicegroups MAPREDUCE
check_command check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("::hdp::hs_port")%>!3000!5000
normal_check_interval 5
@@ -529,7 +529,7 @@ define service {
define service {
hostgroup_name slaves
use hadoop-service
- service_description DATANODE::DataNode process down
+ service_description DATANODE::DataNode process
servicegroups HDFS
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!-w 1 -c 1
normal_check_interval 1
@@ -540,7 +540,7 @@ define service {
define service {
hostgroup_name slaves
use hadoop-service
- service_description DATANODE::DataNode storage full
+ service_description DATANODE::DataNode space
servicegroups HDFS
check_command check_datanode_storage!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!90%!90%
normal_check_interval 5
@@ -555,7 +555,7 @@ define service {
define service {
hostgroup_name flume-servers
use hadoop-service
- service_description FLUME::Flume Agent process down
+ service_description FLUME::Flume Agent process
servicegroups FLUME
check_command check_tcp!<%=scope.function_hdp_template_var("flume_port")%>!-w 1 -c 1
normal_check_interval 1
@@ -570,7 +570,7 @@ define service {
define service {
hostgroup_name zookeeper-servers
use hadoop-service
- service_description ZOOKEEPER::ZooKeeper Server process down
+ service_description ZOOKEEPER::ZooKeeper Server process
servicegroups ZOOKEEPER
check_command check_tcp!<%=scope.function_hdp_template_var("::clientPort")%>!-w 1 -c 1
normal_check_interval 1
@@ -584,7 +584,7 @@ define service {
define service {
hostgroup_name region-servers
use hadoop-service
- service_description REGIONSERVER::RegionServer process down
+ service_description REGIONSERVER::RegionServer process
servicegroups HBASE
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_rs_port")%>!-w 1 -c 1
normal_check_interval 1
@@ -596,7 +596,7 @@ define service {
define service {
hostgroup_name hbasemasters
use hadoop-service
- service_description HBASEMASTER::HBase Master Web UI down
+ service_description HBASEMASTER::HBase Master Web UI
servicegroups HBASE
check_command check_webui!hbase!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%>
normal_check_interval 1
@@ -618,7 +618,7 @@ define service {
define service {
hostgroup_name hbasemasters
use hadoop-service
- service_description HBASEMASTER::HBase Master process down
+ service_description HBASEMASTER::HBase Master process
servicegroups HBASE
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%>!-w 1 -c 1
normal_check_interval 0.5
@@ -632,7 +632,7 @@ define service {
define service {
hostgroup_name hiveserver
use hadoop-service
- service_description HIVE-METASTORE::Hive Metastore status check
+ service_description HIVE-METASTORE::Hive Metastore status
servicegroups HIVE-METASTORE
<%if scope.function_hdp_template_var("security_enabled")-%>
check_command check_hive_metastore_status!<%=scope.function_hdp_template_var("::hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
@@ -649,7 +649,7 @@ define service {
define service {
hostgroup_name oozie-server
use hadoop-service
- service_description OOZIE::Oozie Server status check
+ service_description OOZIE::Oozie Server status
servicegroups OOZIE
<%if scope.function_hdp_template_var("security_enabled")-%>
check_command check_oozie_status!<%=scope.function_hdp_template_var("::hdp::oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
@@ -666,7 +666,7 @@ define service {
define service {
hostgroup_name webhcat-server
use hadoop-service
- service_description WEBHCAT::WebHCat Server status check
+ service_description WEBHCAT::WebHCat Server status
servicegroups WEBHCAT
<%if scope.function_hdp_template_var("security_enabled")-%>
check_command check_templeton_status!<%=scope.function_hdp_template_var("::hdp::templeton_port")%>!v1!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
@@ -683,7 +683,7 @@ define service {
define service {
hostgroup_name hue-server
use hadoop-service
- service_description HUE::Hue Server status check
+ service_description HUE::Hue Server status
servicegroups HUE
check_command check_hue_status
normal_check_interval 100