You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2013/08/09 23:46:41 UTC
git commit: AMBARI-2861. YARN RM/NM alerts need to be generated.
(Vitaly Brodetskyi via swagle)
Updated Branches:
refs/heads/trunk f4cc4c887 -> 4be888c57
AMBARI-2861. YARN RM/NM alerts need to be generated. (Vitaly Brodetskyi via swagle)
Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/4be888c5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/4be888c5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/4be888c5
Branch: refs/heads/trunk
Commit: 4be888c577e71cef7ada6f68548bff436ac10d50
Parents: f4cc4c8
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Fri Aug 9 14:46:31 2013 -0700
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Fri Aug 9 14:46:31 2013 -0700
----------------------------------------------------------------------
.../files/check_nodemanager_health.sh | 32 ++++++++++++++
.../check_resourcemanager_nodes_percentage.sh | 45 ++++++++++++++++++++
.../hdp-nagios/manifests/server/config.pp | 2 +
.../templates/hadoop-commands.cfg.erb | 10 +++++
.../templates/hadoop-services.cfg.erb | 32 ++++++++++++++
5 files changed, 121 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
new file mode 100644
index 0000000..ca13909
--- /dev/null
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+HOST=$1
+PORT=$2
+NODEMANAGER_URL="http://$HOST:$PORT/ws/v1/node/info"
+export PATH="/usr/bin:$PATH"
+RESPONSE=`curl $NODEMANAGER_URL`
+if [[ "$RESPONSE" == *'"nodeHealthy":true'* ]]; then
+ echo "OK: nodemanager healthy true";
+ exit 0;
+fi
+echo "CRITICAL: nodemanager healthy false";
+exit 2;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
new file mode 100644
index 0000000..48a2aae
--- /dev/null
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+HOST=$1
+PORT=$2
+#Resource manager nodes, with selected status, which number we want to know
+NODE_STATUS=$3
+WARN_PERCENT=$4
+CRIT_PERCENT=$5
+NODES="Nodes"
+RESOURCEMANAGER_URL="http://$HOST:$PORT/ws/v1/cluster/metrics"
+export PATH="/usr/bin:$PATH"
+RESPONSE=`curl $RESOURCEMANAGER_URL`
+#code below is parsing RESPONSE that we get from resourcemanager api, for number between "totalNodes": and ','
+TOTAL_NODES_NUM=`echo "$RESPONSE" | sed -nre 's/^.*"totalNodes":([[:digit:]]+).*$/\1/gp'`
+NODES_NUM=`echo "$RESPONSE" | sed -nre "s/^.*\"$NODE_STATUS$NODES\":([[:digit:]]+).*$/\1/gp"`
+PERCENT=$(($NODES_NUM*100/$TOTAL_NODES_NUM))
+if [[ "$PERCENT" -lt "$WARN_PERCENT" ]]; then
+ echo "OK: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+ exit 0;
+elif [[ "$PERCENT" -lt "$CRIT_PERCENT" ]]; then
+ echo "WARN: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+ exit 1;
+else
+ echo "CRITICAL: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+ exit 2;
+fi
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
index 025bcd7..598a8f5 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
@@ -50,6 +50,8 @@ class hdp-nagios::server::config()
hdp-nagios::server::check { 'check_ambari_agent_status.sh': }
hdp-nagios::server::check { 'check_hue_status.sh': }
hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': }
+ hdp-nagios::server::check { 'check_nodemanager_health.sh': }
+ hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': }
anchor{'hdp-nagios::server::config::begin':} -> Hdp-nagios::Server::Configfile<||> -> anchor{'hdp-nagios::server::config::end':}
Anchor['hdp-nagios::server::config::begin'] -> Hdp-nagios::Server::Check<||> -> Anchor['hdp-nagios::server::config::end']
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
index 1233e18..4dbc398 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
@@ -96,3 +96,13 @@ define command{
command_name check_mapred_local_dir_used_space
command_line $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$
}
+
+define command{
+ command_name check_nodemanager_health
+ command_line $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$ ; script args: host, port ($ARG1$)
+ }
+
+define command{
+ command_name check_resourcemanager_nodes_percentage
+ command_line $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ ; script args: host, port, node status, warn percent, crit percent
+ }
http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
index 8e29808..401c79f 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
@@ -435,6 +435,27 @@ define service {
max_check_attempts 5
}
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::Resource Manager percent nodemanager down
+ servicegroups YARN
+ check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30 ; args: rm_port, status "lost", WARN from 10 percent, CRITICAL from 30 percent
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::Resource Manager percent nodemanager unhealthy
+ servicegroups YARN
+ check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30 ; args: rm_port, status "unhealthy", WARN from 10 percent, CRITICAL from 30 percent
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
<% end %>
<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
@@ -449,6 +470,17 @@ define service {
retry_check_interval 0.5
max_check_attempts 3
}
+
+define service {
+ hostgroup_name nodemanagers
+ use hadoop-service
+ service_description NODEMANAGER::Node Manager unhealthy
+ servicegroups YARN
+ check_command check_nodemanager_health!<%=scope.function_hdp_template_var("nm_port")%> ; single arg: the nm_port template variable, passed to the script as its port
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
<% end %>
<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>