You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2013/08/09 23:46:41 UTC

git commit: AMBARI-2861. YARN RM/NM alerts need to be generated. (Vitaly Brodetskyi via swagle)

Updated Branches:
  refs/heads/trunk f4cc4c887 -> 4be888c57


AMBARI-2861. YARN RM/NM alerts need to be generated. (Vitaly Brodetskyi via swagle)


Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/4be888c5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/4be888c5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/4be888c5

Branch: refs/heads/trunk
Commit: 4be888c577e71cef7ada6f68548bff436ac10d50
Parents: f4cc4c8
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Fri Aug 9 14:46:31 2013 -0700
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Fri Aug 9 14:46:31 2013 -0700

----------------------------------------------------------------------
 .../files/check_nodemanager_health.sh           | 32 ++++++++++++++
 .../check_resourcemanager_nodes_percentage.sh   | 45 ++++++++++++++++++++
 .../hdp-nagios/manifests/server/config.pp       |  2 +
 .../templates/hadoop-commands.cfg.erb           | 10 +++++
 .../templates/hadoop-services.cfg.erb           | 32 ++++++++++++++
 5 files changed, 121 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
new file mode 100644
index 0000000..ca13909
--- /dev/null
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+# Nagios check: reports whether a YARN NodeManager considers itself healthy.
+#
+# Usage: check_nodemanager_health.sh HOST PORT
+#
+# Fetches http://HOST:PORT/ws/v1/node/info and looks for the literal text
+# "nodeHealthy":true in the response body.
+#
+# Exit status (Nagios plugin convention):
+#   0 OK       - the response contains "nodeHealthy":true
+#   2 CRITICAL - any other response, including none at all
+#   3 UNKNOWN  - HOST or PORT was not supplied
+#
+HOST=$1
+PORT=$2
+if [[ -z "$HOST" || -z "$PORT" ]]; then
+  echo "UNKNOWN: usage: check_nodemanager_health.sh HOST PORT"
+  exit 3
+fi
+NODEMANAGER_URL="http://$HOST:$PORT/ws/v1/node/info"
+export PATH="/usr/bin:$PATH"
+# -s suppresses curl's progress meter, which is otherwise written to stderr
+# whenever stdout is captured; quoting keeps the URL a single argument.
+RESPONSE=$(curl -s "$NODEMANAGER_URL")
+if [[ "$RESPONSE" == *'"nodeHealthy":true'* ]]; then
+  echo "OK: nodemanager healthy true"
+  exit 0
+fi
+echo "CRITICAL: nodemanager healthy false"
+exit 2
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
new file mode 100644
index 0000000..48a2aae
--- /dev/null
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+# Nagios check: percentage of ResourceManager nodes in a given state.
+#
+# Usage: check_resourcemanager_nodes_percentage.sh HOST PORT NODE_STATUS \
+#          WARN_PERCENT CRIT_PERCENT
+#
+# Fetches http://HOST:PORT/ws/v1/cluster/metrics and reads two counters from
+# the response: "totalNodes" and "<NODE_STATUS>Nodes" (NODE_STATUS "lost"
+# selects "lostNodes"). The value checked against the thresholds is
+# <NODE_STATUS>Nodes * 100 / totalNodes, using integer division.
+#
+# Exit status (Nagios plugin convention):
+#   0 OK       - percentage is below WARN_PERCENT
+#   1 WARNING  - percentage is at least WARN_PERCENT, below CRIT_PERCENT
+#   2 CRITICAL - percentage is at least CRIT_PERCENT
+#   3 UNKNOWN  - bad thresholds, or the counters could not be read
+#
+HOST=$1
+PORT=$2
+# Status prefix of the "<status>Nodes" counter that is compared to "totalNodes".
+NODE_STATUS=$3
+WARN_PERCENT=$4
+CRIT_PERCENT=$5
+NODES="Nodes"
+RESOURCEMANAGER_URL="http://$HOST:$PORT/ws/v1/cluster/metrics"
+export PATH="/usr/bin:$PATH"
+if [[ ! "$WARN_PERCENT" =~ ^[0-9]+$ || ! "$CRIT_PERCENT" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN: warning and critical thresholds must be non-negative integers"
+  exit 3
+fi
+# -s suppresses curl's progress meter, which is otherwise written to stderr
+# whenever stdout is captured; quoting keeps the URL a single argument.
+RESPONSE=$(curl -s "$RESOURCEMANAGER_URL")
+# Pull out the digits that follow "totalNodes": and "<NODE_STATUS>Nodes": in the response.
+TOTAL_NODES_NUM=$(echo "$RESPONSE" | sed -nre 's/^.*"totalNodes":([[:digit:]]+).*$/\1/gp')
+NODES_NUM=$(echo "$RESPONSE" | sed -nre "s/^.*\"$NODE_STATUS$NODES\":([[:digit:]]+).*$/\1/gp")
+# Empty counters (e.g. ResourceManager unreachable, unexpected response) would
+# break the arithmetic below and leave PERCENT empty, which [[ -lt ]] evaluates
+# as 0 and would be reported as OK. Report UNKNOWN instead.
+if [[ ! "$TOTAL_NODES_NUM" =~ ^[0-9]+$ || ! "$NODES_NUM" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN: unable to read totalNodes/$NODE_STATUS$NODES from $RESOURCEMANAGER_URL"
+  exit 3
+fi
+if [[ "$TOTAL_NODES_NUM" -eq 0 ]]; then
+  # Nothing to divide by: count any affected node as 100%, none as 0%.
+  if [[ "$NODES_NUM" -gt 0 ]]; then PERCENT=100; else PERCENT=0; fi
+else
+  PERCENT=$((NODES_NUM * 100 / TOTAL_NODES_NUM))
+fi
+if [[ "$PERCENT" -lt "$WARN_PERCENT" ]]; then
+  echo "OK: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+  exit 0
+elif [[ "$PERCENT" -lt "$CRIT_PERCENT" ]]; then
+  echo "WARN: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+  exit 1
+else
+  echo "CRITICAL: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
+  exit 2
+fi

http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
index 025bcd7..598a8f5 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp
@@ -50,6 +50,8 @@ class hdp-nagios::server::config()
   hdp-nagios::server::check { 'check_ambari_agent_status.sh': }
   hdp-nagios::server::check { 'check_hue_status.sh': }
   hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': }
+  hdp-nagios::server::check { 'check_nodemanager_health.sh': }
+  hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': }
 
   anchor{'hdp-nagios::server::config::begin':} -> Hdp-nagios::Server::Configfile<||> -> anchor{'hdp-nagios::server::config::end':}
   Anchor['hdp-nagios::server::config::begin'] -> Hdp-nagios::Server::Check<||> -> Anchor['hdp-nagios::server::config::end']

http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
index 1233e18..4dbc398 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb
@@ -96,3 +96,13 @@ define command{
        command_name    check_mapred_local_dir_used_space
        command_line    $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$
        }
+
+define command{
+        command_name    check_nodemanager_health
+        command_line    $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$
+       }
+
+define command{
+        command_name    check_resourcemanager_nodes_percentage
+        command_line    $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$
+       }

http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
index 8e29808..401c79f 100644
--- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
+++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
@@ -435,6 +435,27 @@ define service {
         max_check_attempts      5
 }
 
+define service {
+        hostgroup_name          resourcemanager
+        use                     hadoop-service
+        service_description     RESOURCEMANAGER::Resource Manager percent nodemanager down
+        servicegroups           YARN
+        check_command           check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30
+        normal_check_interval   1
+        retry_check_interval    1
+        max_check_attempts      3
+}
+
+define service {
+        hostgroup_name          resourcemanager
+        use                     hadoop-service
+        service_description     RESOURCEMANAGER::Resource Manager percent nodemanager unhealthy
+        servicegroups           YARN
+        check_command           check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30
+        normal_check_interval   1
+        retry_check_interval    1
+        max_check_attempts      3
+}
 <% end %>
 
 <%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
@@ -449,6 +470,17 @@ define service {
         retry_check_interval    0.5
         max_check_attempts      3
 }
+
+define service {
+        hostgroup_name          nodemanagers
+        use                     hadoop-service
+        service_description     NODEMANAGER::Node Manager unhealthy
+        servicegroups           YARN
+        check_command           check_nodemanager_health!<%=scope.function_hdp_template_var("nm_port")%>
+        normal_check_interval   1
+        retry_check_interval    1
+        max_check_attempts      3
+}
 <% end %>
 
 <%if scope.function_hdp_nagios_members_exist('historyserver2')-%>