You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ao...@apache.org on 2013/12/13 17:27:22 UTC
[2/3] AMBARI-4064. Nagios on HDP2. Using resource management lib
(aonishuk)
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/scripts/params.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/scripts/params.py b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/scripts/params.py
new file mode 100644
index 0000000..17111cc
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/scripts/params.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Ambari Agent
+
+"""
+
+from resource_management import *
+from functions import get_port_from_url
+
+# server configurations
+config = Script.get_config()  # nested dict; indexed below by 'configurations' and 'clusterHostInfo'
+
+conf_dir = "/etc/nagios"
+nagios_var_dir = "/var/nagios"
+nagios_rw_dir = "/var/nagios/rw"
+plugins_dir = "/usr/lib64/nagios/plugins"
+nagios_obj_dir = "/etc/nagios/objects"
+check_result_path = "/var/nagios/spool/checkresults"
+nagios_httpd_config_file = format("/etc/httpd/conf.d/nagios.conf")  # string has no {placeholders}
+nagios_pid_dir = "/var/run/nagios"
+pid_file = format("{nagios_pid_dir}/nagios.pid")  # same value as nagios_pid_file below
+nagios_log_dir = "/var/log/nagios"
+nagios_log_archives_dir = format("{nagios_log_dir}/archives")
+nagios_host_cfg = format("{nagios_obj_dir}/hadoop-hosts.cfg")
+nagios_lookup_daemon_str = "/usr/sbin/nagios"  # last argument of check_nagios in hadoop-services.cfg.j2
+nagios_pid_dir = "/var/run/nagios"  # repeats the identical assignment made above
+nagios_pid_file = format("{nagios_pid_dir}/nagios.pid")
+nagios_resource_cfg = format("{conf_dir}/resource.cfg")
+nagios_hostgroup_cfg = format("{nagios_obj_dir}/hadoop-hostgroups.cfg")
+nagios_servicegroup_cfg = format("{nagios_obj_dir}/hadoop-servicegroups.cfg")
+nagios_service_cfg = format("{nagios_obj_dir}/hadoop-services.cfg")
+nagios_command_cfg = format("{nagios_obj_dir}/hadoop-commands.cfg")
+eventhandlers_dir = "/usr/lib/nagios/eventhandlers"
+nagios_principal_name = "nagios"
+hadoop_ssl_enabled = False  # hard-coded; templates render it as str(...).lower() into check commands
+
+namenode_metadata_port = "8020"  # used by the "NameNode process" check_tcp service
+oozie_server_port = "11000"
+# differs from HDP1: read from the hdfs-site NameNode HTTP address
+namenode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.namenode.http-address'])  # helper from functions.py, not shown here
+# differs from HDP1: read from the hdfs-site secondary NameNode HTTP address
+snamenode_port = get_port_from_url(config['configurations']['hdfs-site']["dfs.namenode.secondary.http-address"])
+
+hbase_master_rpc_port = "60000"
+rm_port = get_port_from_url(config['configurations']['yarn-site']['yarn.resourcemanager.webapp.address'])
+nm_port = "8042"  # used by the NodeManager process and health checks
+hs_port = get_port_from_url(config['configurations']['mapred-site']['mapreduce.jobhistory.webapp.address'])
+journalnode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.journalnode.http-address'])
+datanode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.datanode.http.address'])
+flume_port = "4159"
+hive_metastore_port = config['configurations']['global']['hive_metastore_port'] #"9083"
+templeton_port = config['configurations']['webhcat-site']['templeton.port'] #"50111"
+
+# this is different for HDP1
+nn_metrics_property = "FSNamesystem"  # passed to check_hdfs_blocks in hadoop-services.cfg.j2
+clientPort = config['configurations']['global']['clientPort'] # ZooKeeper client port
+
+
+java64_home = config['configurations']['global']['java64_home']
+security_enabled = config['configurations']['global']['security_enabled']  # rendered as str(...).lower() by the templates
+
+nagios_keytab_path = default("nagios_keytab_path", "/etc/security/keytabs/nagios.service.keytab")  # NOTE(review): key has no /configurations/... prefix unlike other default() calls - confirm
+kinit_path_local = get_kinit_path([default("kinit_path_local",None), "/usr/bin", "/usr/kerberos/bin", "/usr/sbin"])  # NOTE(review): same un-prefixed key - confirm
+
+dfs_ha_enabled = False
+dfs_ha_nameservices = default("/configurations/hdfs-site/dfs.nameservices", None)
+dfs_ha_namenode_ids = default(format("hdfs-site/dfs.ha.namenodes.{dfs_ha_nameservices}"), None)
+if dfs_ha_namenode_ids:
+ dfs_ha_namenode_ids_array_len = len(dfs_ha_namenode_ids.split(","))
+ if dfs_ha_namenode_ids_array_len > 1:
+ dfs_ha_enabled = True
+
+ganglia_port = "8651"  # checked by "Ganglia Server process"
+ganglia_collector_slaves_port = "8660"
+ganglia_collector_namenode_port = "8661"
+ganglia_collector_jobtracker_port = "8662"
+ganglia_collector_hbase_port = "8663"
+ganglia_collector_rm_port = "8664"
+ganglia_collector_nm_port = "8660"  # same port value as ganglia_collector_slaves_port
+ganglia_collector_hs_port = "8666"
+
+all_ping_ports = config['clusterHostInfo']['all_ping_ports']  # indexed in step with all_hosts by hadoop-services.cfg.j2
+
+if System.get_instance().platform == "suse":  # SUSE uses a different p1.pl path and htpasswd binary name
+ nagios_p1_pl = "/usr/lib/nagios/p1.pl"
+ htpasswd_cmd = "htpasswd2"
+else:
+ nagios_p1_pl = "/usr/bin/p1.pl"
+ htpasswd_cmd = "htpasswd"
+
+nagios_user = config['configurations']['global']['nagios_user']
+nagios_group = config['configurations']['global']['nagios_group']
+nagios_web_login = config['configurations']['global']['nagios_web_login']  # contact_name in contacts.cfg.j2
+nagios_web_password = config['configurations']['global']['nagios_web_password']
+user_group = config['configurations']['global']['user_group']
+nagios_contact = config['configurations']['global']['nagios_contact']  # contact email in contacts.cfg.j2
+
+namenode_host = default("/clusterHostInfo/namenode_host", None)  # iterated per host by hadoop-services.cfg.j2
+_snamenode_host = default("/clusterHostInfo/snamenode_host", None)
+_jtnode_host = default("/clusterHostInfo/jtnode_host", None)
+_slave_hosts = default("/clusterHostInfo/slave_hosts", None)
+_journalnode_hosts = default("/clusterHostInfo/journalnode_hosts", None)
+_zkfc_hosts = default("/clusterHostInfo/zkfc_hosts", None)  # not referenced by hostgroup_defs below
+_rm_host = default("/clusterHostInfo/rm_host", None)
+_nm_hosts = default("/clusterHostInfo/nm_hosts", None)
+_hs_host = default("/clusterHostInfo/hs_host", None)
+_zookeeper_hosts = default("/clusterHostInfo/zookeeper_hosts", None)
+_flume_hosts = default("/clusterHostInfo/flume_hosts", None)
+_nagios_server_host = default("/clusterHostInfo/nagios_server_host",None)
+_ganglia_server_host = default("/clusterHostInfo/ganglia_server_host",None)
+
+_hbase_master_hosts = default("/clusterHostInfo/hbase_master_hosts",None)
+_hive_server_host = default("/clusterHostInfo/hive_server_host",None)
+_oozie_server = default("/clusterHostInfo/oozie_server",None)
+_webhcat_server_host = default("/clusterHostInfo/webhcat_server_host",None)
+# can differ on HDP1
+#_mapred_tt_hosts = _slave_hosts
+# if hbase_rs_hosts is not given, region servers are assumed to run on the same nodes as the slaves
+_hbase_rs_hosts = default("/clusterHostInfo/hbase_rs_hosts", _slave_hosts)
+_hue_server_host = default("/clusterHostInfo/hue_server_host", None)
+all_hosts = config['clusterHostInfo']['all_hosts']  # mandatory key, unlike the default(...) lookups above
+
+
+hostgroup_defs = {  # hostgroup name -> hosts value (None when absent); falsy entries are skipped by hadoop-hostgroups.cfg.j2
+ 'namenode' : namenode_host,
+ 'snamenode' : _snamenode_host,
+ 'slaves' : _slave_hosts,
+ # HDP1
+ #'tasktracker-servers' : _mapred_tt_hosts,
+ 'agent-servers' : all_hosts,
+ 'nagios-server' : _nagios_server_host,
+ 'jobtracker' : _jtnode_host,  # also gates the MAPREDUCE servicegroup in hadoop-servicegroups.cfg.j2
+ 'ganglia-server' : _ganglia_server_host,
+ 'flume-servers' : _flume_hosts,
+ 'zookeeper-servers' : _zookeeper_hosts,
+ 'hbasemasters' : _hbase_master_hosts,
+ 'hiveserver' : _hive_server_host,
+ 'region-servers' : _hbase_rs_hosts,
+ 'oozie-server' : _oozie_server,
+ 'webhcat-server' : _webhcat_server_host,
+ 'hue-server' : _hue_server_host,
+ 'resourcemanager' : _rm_host,
+ 'nodemanagers' : _nm_hosts,
+ 'historyserver2' : _hs_host,
+ 'journalnodes' : _journalnode_hosts
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/contacts.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/contacts.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/contacts.cfg.j2
new file mode 100644
index 0000000..9dada51
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/contacts.cfg.j2
@@ -0,0 +1,91 @@
+###############################################################################
+# CONTACTS.CFG - SAMPLE CONTACT/CONTACTGROUP DEFINITIONS
+#
+# Last Modified: 05-31-2007
+#
+# NOTES: This config file provides you with some example contact and contact
+# group definitions that you can reference in host and service
+# definitions.
+#
+# You don't need to keep these definitions in a separate file from your
+# other object definitions. This has been done just to make things
+# easier to understand.
+#
+###############################################################################
+
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+#
+
+
+###############################################################################
+###############################################################################
+#
+# CONTACTS
+#
+###############################################################################
+###############################################################################
+
+# Just one contact defined by default - the Nagios admin (that's you)
+# This contact definition inherits a lot of default values from the 'generic-contact'
+# template which is defined elsewhere.
+
+define contact{
+ contact_name {{nagios_web_login}} ; Short name of user
+ use generic-contact ; Inherit default values from generic-contact template (defined elsewhere)
+ alias Nagios Admin ; Full name of user
+
+ email {{nagios_contact}} ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
+ }
+
+# Contact which writes all Nagios alerts to the system logger.
+define contact{
+ contact_name sys_logger ; Short name of user
+ use generic-contact ; Inherit default values from generic-contact template (defined elsewhere)
+ alias System Logger ; Full name of user
+ host_notifications_enabled 1
+ service_notifications_enabled 1
+ service_notification_period 24x7
+ host_notification_period 24x7
+ service_notification_options w,u,c,r,s
+ host_notification_options d,u,r,s
+ can_submit_commands 1
+ retain_status_information 1
+ service_notification_commands service_sys_logger
+ host_notification_commands host_sys_logger
+ }
+
+###############################################################################
+###############################################################################
+#
+# CONTACT GROUPS
+#
+###############################################################################
+###############################################################################
+
+# We only have one contact in this simple configuration file, so there is
+# no need to create more than one contact group.
+
+define contactgroup {
+ contactgroup_name admins
+ alias Nagios Administrators
+ members {{nagios_web_login}},sys_logger
+}
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
new file mode 100644
index 0000000..e47a09e
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
@@ -0,0 +1,114 @@
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+#
+
+{% if env.system.platform != "suse" %}
+# 'check_cpu' check remote cpu load
+define command {
+ command_name check_cpu
+ command_line $USER1$/check_cpu.pl -H $HOSTADDRESS$ -C hadoop -w $ARG1$ -c $ARG2$
+ }
+{% endif %}
+
+# Check data node storage full
+define command {
+ command_name check_datanode_storage
+ command_line php $USER1$/check_datanode_storage.php -h $HOSTADDRESS$ -p $ARG1$ -w $ARG2$ -c $ARG3$ -e $ARG4$ -k $ARG5$ -r $ARG6$ -t $ARG7$ -s $ARG8$
+ }
+
+define command{
+ command_name check_hdfs_blocks
+ command_line php $USER1$/check_hdfs_blocks.php -h $ARG1$ -p $ARG2$ -w $ARG3$ -c $ARG4$ -s $ARG5$ -e $ARG6$ -k $ARG7$ -r $ARG8$ -t $ARG9$ -u $ARG10$
+ }
+
+define command{
+ command_name check_hdfs_capacity
+ command_line php $USER1$/check_hdfs_capacity.php -h $ARG1$ -p $ARG2$ -w $ARG3$ -c $ARG4$ -e $ARG5$ -k $ARG6$ -r $ARG7$ -t $ARG8$ -s $ARG9$
+ }
+
+define command{
+ command_name check_aggregate
+ command_line php $USER1$/check_aggregate.php -f /var/nagios/status.dat -s 1 -t service -n $ARG1$ -w $ARG2$ -c $ARG3$
+ }
+
+define command{
+ command_name check_rpcq_latency
+ command_line php $USER1$/check_rpcq_latency.php -h $HOSTADDRESS$ -p $ARG2$ -n $ARG1$ -w $ARG3$ -c $ARG4$ -e $ARG5$ -k $ARG6$ -r $ARG7$ -t $ARG8$ -s $ARG9$
+ }
+
+define command{
+ command_name check_nagios
+ command_line $USER1$/check_nagios -e $ARG1$ -F $ARG2$ -C $ARG3$
+ }
+
+define command{
+ command_name check_webui
+ command_line $USER1$/check_webui.sh $ARG1$ $HOSTADDRESS$ $ARG2$
+ }
+
+define command{
+ command_name check_name_dir_status
+ command_line php $USER1$/check_name_dir_status.php -h $HOSTADDRESS$ -p $ARG1$ -e $ARG2$ -k $ARG3$ -r $ARG4$ -t $ARG5$ -s $ARG6$
+ }
+
+define command{
+ command_name check_oozie_status
+ command_line $USER1$/check_oozie_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$
+ }
+
+define command{
+ command_name check_templeton_status
+ command_line $USER1$/check_templeton_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$
+ }
+
+define command{
+ command_name check_hive_metastore_status
+ command_line $USER1$/check_hive_metastore_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$
+ }
+define command{
+ command_name check_hue_status
+ command_line $USER1$/check_hue_status.sh
+ }
+
+define command{
+ command_name check_mapred_local_dir_used_space
+ command_line $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$
+ }
+
+define command{
+ command_name check_namenodes_ha
+ command_line $USER1$/check_namenodes_ha.sh $ARG1$ $ARG2$
+ }
+
+define command{
+ command_name check_nodemanager_health
+ command_line $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$
+ }
+
+define command{
+ command_name host_sys_logger
+ command_line $USER1$/sys_logger.py $HOSTSTATETYPE$ $HOSTATTEMPT$ $HOSTSTATE$ "Host::Ping" "Event Host=$HOSTADDRESS$($HOSTSTATE$), $HOSTOUTPUT$ $LONGHOSTOUTPUT$"
+ }
+
+define command{
+ command_name service_sys_logger
+ command_line $USER1$/sys_logger.py $SERVICESTATETYPE$ $SERVICEATTEMPT$ $SERVICESTATE$ "$SERVICEDESC$" "Event Host=$HOSTADDRESS$ Service Description=$SERVICEDESC$($SERVICESTATE$), $SERVICEOUTPUT$ $LONGSERVICEOUTPUT$"
+ }
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2
new file mode 100644
index 0000000..2bcbf7c
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2
@@ -0,0 +1,15 @@
+{% for name, hosts in hostgroup_defs.iteritems() %}
+{% if hosts %}
+define hostgroup {
+ hostgroup_name {{name}}
+ alias {{name}}
+ members {{','.join(hosts)}}
+}
+{% endif %}
+{% endfor %}
+
+define hostgroup {
+ hostgroup_name all-servers
+ alias All Servers
+ members {{','.join(all_hosts)}}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2
new file mode 100644
index 0000000..62555d4
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2
@@ -0,0 +1,16 @@
+{% for host in all_hosts %}
+define host {
+ alias {{host}}
+ host_name {{host}}
+ use linux-server
+ address {{host}}
+ check_interval 0.25
+ retry_interval 0.25
+ max_check_attempts 4
+ notifications_enabled 1
+ first_notification_delay 0 ; Send notification soon after change in the hard state
+ notification_interval 0 ; Send the notification once
+ notification_options d,u,r
+}
+
+{% endfor %}
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2
new file mode 100644
index 0000000..0101ce6
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2
@@ -0,0 +1,80 @@
+{% if hostgroup_defs['namenode'] or
+ hostgroup_defs['snamenode'] or
+ hostgroup_defs['slaves'] %}
+define servicegroup {
+ servicegroup_name HDFS
+ alias HDFS Checks
+}
+{% endif %}
+{%if hostgroup_defs['jobtracker'] or
+ hostgroup_defs['historyserver2']-%}
+define servicegroup {
+ servicegroup_name MAPREDUCE
+ alias MAPREDUCE Checks
+}
+{% endif %}
+{%if hostgroup_defs['resourcemanager'] or
+ hostgroup_defs['nodemanagers'] %}
+define servicegroup {
+ servicegroup_name YARN
+ alias YARN Checks
+}
+{% endif %}
+{%if hostgroup_defs['flume-servers'] %}
+define servicegroup {
+ servicegroup_name FLUME
+ alias FLUME Checks
+}
+{% endif %}
+{%if hostgroup_defs['hbasemasters'] %}
+define servicegroup {
+ servicegroup_name HBASE
+ alias HBASE Checks
+}
+{% endif %}
+{% if hostgroup_defs['oozie-server'] %}
+define servicegroup {
+ servicegroup_name OOZIE
+ alias OOZIE Checks
+}
+{% endif %}
+{% if hostgroup_defs['webhcat-server'] %}
+define servicegroup {
+ servicegroup_name WEBHCAT
+ alias WEBHCAT Checks
+}
+{% endif %}
+{% if hostgroup_defs['nagios-server'] %}
+define servicegroup {
+ servicegroup_name NAGIOS
+ alias NAGIOS Checks
+}
+{% endif %}
+{% if hostgroup_defs['ganglia-server'] %}
+define servicegroup {
+ servicegroup_name GANGLIA
+ alias GANGLIA Checks
+}
+{% endif %}
+{% if hostgroup_defs['hiveserver'] %}
+define servicegroup {
+ servicegroup_name HIVE-METASTORE
+ alias HIVE-METASTORE Checks
+}
+{% endif %}
+{% if hostgroup_defs['zookeeper-servers'] %}
+define servicegroup {
+ servicegroup_name ZOOKEEPER
+ alias ZOOKEEPER Checks
+}
+{% endif %}
+define servicegroup {
+ servicegroup_name AMBARI
+ alias AMBARI Checks
+}
+{% if hostgroup_defs['hue-server'] %}
+define servicegroup {
+ servicegroup_name HUE
+ alias HUE Checks
+}
+{% endif %}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/65aec661/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-services.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-services.cfg.j2
new file mode 100644
index 0000000..96fce7c
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0._/services/NAGIOS/package/templates/hadoop-services.cfg.j2
@@ -0,0 +1,654 @@
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+{# TODO: Look for { or } in created file #}
+# NAGIOS SERVER Check (status log update)
+{% if hostgroup_defs['nagios-server'] %}
+define service {
+ name hadoop-service
+ use generic-service
+ notification_options w,u,c,r,f,s
+ first_notification_delay 0
+ notification_interval 0 ; Send the notification once
+ contact_groups admins
+ notifications_enabled 1
+ event_handler_enabled 1
+ register 0 ; template only: inherited through "use hadoop-service", not a real service
+}
+
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description NAGIOS::Nagios status log freshness
+ servicegroups NAGIOS
+ check_command check_nagios!10!/var/nagios/status.dat!{{nagios_lookup_daemon_str}}
+ normal_check_interval 5
+ retry_check_interval 0.5
+ max_check_attempts 2
+}
+
+# NAGIOS SERVER HDFS Checks
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::Percent DataNodes with space available
+ servicegroups HDFS
+ check_command check_aggregate!"DATANODE::DataNode space"!10%!30%
+ normal_check_interval 2
+ retry_check_interval 1
+ max_check_attempts 1
+}
+
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::Percent DataNodes live
+ servicegroups HDFS
+ check_command check_aggregate!"DATANODE::DataNode process"!10%!30%
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{# used only for HDP2 #}
+{% if hostgroup_defs['namenode'] and dfs_ha_enabled %}
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::NameNode HA Healthy
+ servicegroups HDFS
+ check_command check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 5
+}
+{% endif %}
+
+# AMBARI AGENT Checks
+{% for hostname in all_hosts %}
+define service {
+ host_name {{ hostname }}
+ use hadoop-service
+ service_description AMBARI::Ambari Agent process
+ servicegroups AMBARI
+ check_command check_tcp!{{all_ping_ports[loop.index-1]}}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+
+{% endfor %}
+
+# NAGIOS SERVER ZOOKEEPER Checks
+{% if hostgroup_defs['zookeeper-servers'] %}
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description ZOOKEEPER::Percent ZooKeeper Servers live
+ servicegroups ZOOKEEPER
+ check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process"!35%!70%
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{% endif %}
+
+# NAGIOS SERVER HBASE Checks
+{% if hostgroup_defs['hbasemasters'] %}
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HBASE::Percent RegionServers live
+ servicegroups HBASE
+ check_command check_aggregate!"REGIONSERVER::RegionServer process"!10%!30%
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{% endif %}
+{% endif %}
+
+
+
+# GANGLIA SERVER Checks
+{% if hostgroup_defs['ganglia-server'] %}
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Server process
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for Slaves
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_slaves_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for NameNode
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_namenode_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+
+{% if hostgroup_defs['jobtracker'] %}
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for JobTracker
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_jobtracker_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+{% endif %}
+
+{% if hostgroup_defs['hbasemasters'] %}
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for HBase Master
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_hbase_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+{% endif %}
+
+{% if hostgroup_defs['resourcemanager'] %}
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for ResourceManager
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_rm_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+{% endif %}
+
+{% if hostgroup_defs['historyserver2'] %}
+define service {
+ hostgroup_name ganglia-server
+ use hadoop-service
+ service_description GANGLIA::Ganglia Monitor process for HistoryServer
+ servicegroups GANGLIA
+ check_command check_tcp!{{ ganglia_collector_hs_port }}!-w 1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+{% endif %}
+
+{% endif %}
+
+{% if hostgroup_defs['snamenode'] %}
+# Secondary namenode checks
+define service {
+ hostgroup_name snamenode
+ use hadoop-service
+ service_description NAMENODE::Secondary NameNode process
+ servicegroups HDFS
+ check_command check_tcp!{{ snamenode_port }}!-w 1 -c 1
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{% endif %}
+
+
+{% if hostgroup_defs['namenode'] %}
+# HDFS Checks
+{% for namenode_hostname in namenode_host %}
+{# TODO: check if we can get rid of str, lower #}
+define service {
+ host_name {{ namenode_hostname }}
+ use hadoop-service
+ service_description NAMENODE::NameNode edit logs directory status on {{ namenode_hostname }}
+ servicegroups HDFS
+ check_command check_name_dir_status!{{ namenode_port }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 0.5
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+{% if env.system.platform != "suse" %}
+define service {
+ host_name {{ namenode_hostname }}
+ use hadoop-service
+ service_description NAMENODE::NameNode host CPU utilization on {{ namenode_hostname }}
+ servicegroups HDFS
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+{% endif %}
+
+define service {
+ host_name {{ namenode_hostname }}
+ use hadoop-service
+ service_description NAMENODE::NameNode Web UI on {{ namenode_hostname }}
+ servicegroups HDFS
+ check_command check_webui!namenode!{{ namenode_port }}
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+define service {
+ host_name {{ namenode_hostname }}
+ use hadoop-service
+ service_description NAMENODE::NameNode process on {{ namenode_hostname }}
+ servicegroups HDFS
+ check_command check_tcp!{{ namenode_metadata_port }}!-w 1 -c 1
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+
+define service {
+ host_name {{ namenode_hostname }}
+ use hadoop-service
+ service_description HDFS::NameNode RPC latency on {{ namenode_hostname }}
+ servicegroups HDFS
+ check_command check_rpcq_latency!NameNode!{{ namenode_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 5
+}
+
+{% endfor %}
+
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::Blocks health
+ servicegroups HDFS
+ check_command check_hdfs_blocks!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!0%!0%!{{ nn_metrics_property }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 2
+ retry_check_interval 1
+ max_check_attempts 1
+}
+
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::HDFS capacity utilization
+ servicegroups HDFS
+ check_command check_hdfs_capacity!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!80%!90%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 10
+ retry_check_interval 1
+ max_check_attempts 1
+}
+
+{% endif %}
+
+# MAPREDUCE Checks
+{# On HDP1 the JobTracker and TaskTracker alerts are defined here #}
+
+{% if hostgroup_defs['resourcemanager'] %}
+# YARN::RESOURCEMANAGER Checks
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::ResourceManager Web UI
+ servicegroups YARN
+ check_command check_webui!resourcemanager!{{ rm_port }}
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+{% if env.system.platform != "suse" %}
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::ResourceManager CPU utilization
+ servicegroups YARN
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+{% endif %}
+
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::ResourceManager RPC latency
+ servicegroups YARN
+ check_command check_rpcq_latency!ResourceManager!{{ rm_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 5
+}
+
+define service {
+ hostgroup_name resourcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::ResourceManager process
+ servicegroups YARN
+ check_command check_tcp!{{ rm_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+
+{% if hostgroup_defs['nodemanagers'] %}
+# YARN::NODEMANAGER Checks
+define service {
+ hostgroup_name nodemanagers
+ use hadoop-service
+ service_description NODEMANAGER::NodeManager process
+ servicegroups YARN
+ check_command check_tcp!{{ nm_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+define service {
+ hostgroup_name nodemanagers
+ use hadoop-service
+ service_description NODEMANAGER::NodeManager health
+ servicegroups YARN
+ check_command check_nodemanager_health!{{ nm_port }}!{{ str(security_enabled).lower() }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description NODEMANAGER::Percent NodeManagers live
+ servicegroups YARN
+ check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30%
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{% endif %}
+
+{% if hostgroup_defs['historyserver2'] %}
+# MAPREDUCE::JOBHISTORY Checks
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description JOBHISTORY::HistoryServer Web UI
+ servicegroups MAPREDUCE
+ check_command check_webui!historyserver2!{{ hs_port }}
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+{% if env.system.platform != "suse" %}
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description JOBHISTORY::HistoryServer CPU utilization
+ servicegroups MAPREDUCE
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+{% endif %}
+
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description JOBHISTORY::HistoryServer RPC latency
+ servicegroups MAPREDUCE
+ check_command check_rpcq_latency!JobHistoryServer!{{ hs_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 5
+}
+
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description JOBHISTORY::HistoryServer process
+ servicegroups MAPREDUCE
+ check_command check_tcp!{{ hs_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+{% endif %}
+
+{% if hostgroup_defs['journalnodes'] %}
+# Journalnode checks
+define service {
+ hostgroup_name journalnodes
+ use hadoop-service
+ service_description JOURNALNODE::JournalNode process
+ servicegroups HDFS
+ check_command check_tcp!{{ journalnode_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+{% if dfs_ha_enabled %}
+define service {
+ hostgroup_name nagios-server
+ use hadoop-service
+ service_description HDFS::Percent JournalNodes live
+ servicegroups HDFS
+ check_command check_aggregate!"JOURNALNODE::JournalNode process"!33%!50%
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 3
+}
+{% endif %}
+{% endif %}
+
+{% if hostgroup_defs['slaves'] %}
+# HDFS::DATANODE Checks
+define service {
+ hostgroup_name slaves
+ use hadoop-service
+ service_description DATANODE::DataNode process
+ servicegroups HDFS
+ check_command check_tcp!{{datanode_port}}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+define service {
+ hostgroup_name slaves
+ use hadoop-service
+ service_description DATANODE::DataNode space
+ servicegroups HDFS
+ check_command check_datanode_storage!{{ datanode_port }}!90%!90%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 2
+}
+
+{% endif %}
+
+{% if hostgroup_defs['flume-servers'] %}
+# FLUME Checks
+define service {
+ hostgroup_name flume-servers
+ use hadoop-service
+ service_description FLUME::Flume Agent process
+ servicegroups FLUME
+ check_command check_tcp!{{ flume_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+
+
+{% if hostgroup_defs['zookeeper-servers'] %}
+# ZOOKEEPER Checks
+define service {
+ hostgroup_name zookeeper-servers
+ use hadoop-service
+ service_description ZOOKEEPER::ZooKeeper Server process
+ servicegroups ZOOKEEPER
+ check_command check_tcp!{{ clientPort }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+
+{% if hostgroup_defs['hbasemasters'] %}
+{# NOTE(review): this section is guarded by the 'hbasemasters' host group, but the first service below targets the 'region-servers' host group. Confirm that 'region-servers' is always defined whenever HBase masters exist. #}
+# HBASE::REGIONSERVER Checks
+define service {
+ hostgroup_name region-servers
+ use hadoop-service
+ service_description REGIONSERVER::RegionServer process
+ servicegroups HBASE
+ check_command check_tcp!{{ hbase_rs_port }}!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+
+{# HBASE:: MASTER Checks
+# define service {
+# hostgroup_name hbasemasters
+# use hadoop-service
+# service_description HBASEMASTER::HBase Master Web UI
+# servicegroups HBASE
+# check_command check_webui!hbase!{{ hbase_master_port }}
+# normal_check_interval 1
+# retry_check_interval 1
+# max_check_attempts 3
+# #}
+{# One CPU check (skipped on the "suse" platform) and one TCP process check are emitted per host in hbase_master_hosts. #}
+{% for hbasemaster in hbase_master_hosts %}
+{% if env.system.platform != "suse" %}
+define service {
+ host_name {{ hbasemaster }}
+ use hadoop-service
+ service_description HBASEMASTER::HBase Master CPU utilization on {{ hbasemaster }}
+ servicegroups HBASE
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+{% endif %}
+define service {
+ host_name {{ hbasemaster }}
+ use hadoop-service
+ service_description HBASEMASTER::HBase Master process on {{ hbasemaster }}
+ servicegroups HBASE
+ check_command check_tcp!{{ hbase_master_rpc_port }}!-w 1 -c 1
+ normal_check_interval 0.5
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+{% endfor %}
+{% endif %}
+
+{% if hostgroup_defs['hiveserver'] %}
+# HIVE Metastore check
+define service {
+ hostgroup_name hiveserver
+ use hadoop-service
+ service_description HIVE-METASTORE::Hive Metastore status
+ servicegroups HIVE-METASTORE
+ {% if security_enabled %}
+ check_command check_hive_metastore_status!{{ hive_metastore_port }}!{{ java64_home }}!true!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}
+ {% else %}
+ check_command check_hive_metastore_status!{{ hive_metastore_port }}!{{ java64_home }}!false
+ {% endif %}
+ normal_check_interval 0.5
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+{% if hostgroup_defs['oozie-server'] %}
+# Oozie check
+{# When security is enabled, the command also receives the Nagios keytab path, principal name and kinit path as extra '!'-separated arguments; otherwise only the port, Java home and 'false' are passed. #}
+define service {
+ hostgroup_name oozie-server
+ use hadoop-service
+ service_description OOZIE::Oozie Server status
+ servicegroups OOZIE
+ {% if security_enabled %}
+ check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!true!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}
+ {% else %}
+ check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!false
+ {% endif %}
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+{% endif %}
+{% if hostgroup_defs['webhcat-server'] %}
+# WEBHCAT check
+define service {
+ hostgroup_name webhcat-server
+ use hadoop-service
+ service_description WEBHCAT::WebHCat Server status
+ servicegroups WEBHCAT
+ {% if security_enabled %}
+ check_command check_templeton_status!{{ templeton_port }}!v1!{{ str(security_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}
+ {% else %}
+ check_command check_templeton_status!{{ templeton_port }}!v1!false
+ {% endif %}
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+
+{% if hostgroup_defs['hue-server'] %}
+define service {
+ hostgroup_name hue-server
+ use hadoop-service
+ service_description HUE::Hue Server status
+ servicegroups HUE
+ check_command check_hue_status
+ normal_check_interval 100
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+{% endif %}
+